From 0098878d9c13a7f720d29edaf0976ea12ada0102 Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <nnethercote@vectorware.com>
Date: Fri, 7 Nov 2025 13:57:14 +1100
Subject: [PATCH 1/4] Improve and rename `inbounds!` macro.

It has two versions, one with an upper bound, and one with a lower and
upper bound. This commit removes the first one and changes the second
one to take a range, because that is more concise and flexible and
clearer.

Also, rename it as `in_range`, which makes sense given that the bounds
are specified via a Rust `Range`.

Note: some of the ranges are incorrect, and will be fixed in the next
commit.
---
 crates/cuda_std/src/thread.rs | 49 ++++++++++++++---------------------
 1 file changed, 19 insertions(+), 30 deletions(-)

diff --git a/crates/cuda_std/src/thread.rs b/crates/cuda_std/src/thread.rs
index 449cc972..e2426001 100644
--- a/crates/cuda_std/src/thread.rs
+++ b/crates/cuda_std/src/thread.rs
@@ -89,26 +89,15 @@ extern "C" {
 }
 
 #[cfg(target_os = "cuda")]
-macro_rules! inbounds {
-    // the bounds were taken mostly from the cuda C++ programming guide, i also
-    // double-checked with what cuda clang does by checking its emitted llvm ir's scalar metadata
-    ($func_name:ident, $bound:expr) => {{
+macro_rules! in_range {
+    // The bounds were taken mostly from the cuda C++ programming guide. I also
+    // double-checked with what cuda clang does by checking its emitted llvm ir's scalar metadata.
+    ($func_name:ident, $range:expr) => {{
         let val = unsafe { $func_name() };
-        if val > $bound {
-            // SAFETY: this condition is declared unreachable by compute capability max bound
+        if !$range.contains(&val) {
+            // SAFETY: this condition is declared unreachable by compute capability max bound.
             // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
-            // we do this to potentially allow for better optimizations by LLVM
-            unsafe { core::hint::unreachable_unchecked() }
-        } else {
-            val
-        }
-    }};
-    ($func_name:ident, $lower_bound:expr, $upper_bound:expr) => {{
-        let val = unsafe { $func_name() };
-        if !($lower_bound..=$upper_bound).contains(&val) {
-            // SAFETY: this condition is declared unreachable by compute capability max bound
-            // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
-            // we do this to potentially allow for better optimizations by LLVM
+            // We do this to potentially allow for better optimizations by LLVM.
             unsafe { core::hint::unreachable_unchecked() }
         } else {
             val
@@ -119,73 +108,73 @@ macro_rules! inbounds {
 #[gpu_only]
 #[inline(always)]
 pub fn thread_idx_x() -> u32 {
-    inbounds!(__nvvm_thread_idx_x, 1024)
+    in_range!(__nvvm_thread_idx_x, 0..=1024)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn thread_idx_y() -> u32 {
-    inbounds!(__nvvm_thread_idx_y, 1024)
+    in_range!(__nvvm_thread_idx_y, 0..=1024)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn thread_idx_z() -> u32 {
-    inbounds!(__nvvm_thread_idx_z, 64)
+    in_range!(__nvvm_thread_idx_z, 0..=64)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn block_idx_x() -> u32 {
-    inbounds!(__nvvm_block_idx_x, 2147483647)
+    in_range!(__nvvm_block_idx_x, 0..=2147483647)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn block_idx_y() -> u32 {
-    inbounds!(__nvvm_block_idx_y, 65535)
+    in_range!(__nvvm_block_idx_y, 0..=65535)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn block_idx_z() -> u32 {
-    inbounds!(__nvvm_block_idx_z, 65535)
+    in_range!(__nvvm_block_idx_z, 0..=65535)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn block_dim_x() -> u32 {
-    inbounds!(__nvvm_block_dim_x, 1, 1025)
+    in_range!(__nvvm_block_dim_x, 1..=1025)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn block_dim_y() -> u32 {
-    inbounds!(__nvvm_block_dim_y, 1, 1025)
+    in_range!(__nvvm_block_dim_y, 1..=1025)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn block_dim_z() -> u32 {
-    inbounds!(__nvvm_block_dim_z, 1, 65)
+    in_range!(__nvvm_block_dim_z, 1..=65)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn grid_dim_x() -> u32 {
-    inbounds!(__nvvm_grid_dim_x, 1, 2147483648)
+    in_range!(__nvvm_grid_dim_x, 1..=2147483648)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn grid_dim_y() -> u32 {
-    inbounds!(__nvvm_grid_dim_y, 1, 65536)
+    in_range!(__nvvm_grid_dim_y, 1..=65536)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn grid_dim_z() -> u32 {
-    inbounds!(__nvvm_grid_dim_z, 1, 65536)
+    in_range!(__nvvm_grid_dim_z, 1..=65536)
 }
 
 /// Gets the 3d index of the thread currently executing the kernel.

From 162e738db5f52c20e686d6892019c045cf00578b Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <nnethercote@vectorware.com>
Date: Fri, 7 Nov 2025 15:19:31 +1100
Subject: [PATCH 2/4] Fix and document `in_range!` usage points.

Every single one has an upper bound that is one higher than it should
be.

- For `thread_idx_[xyz]`: indices are 0-indexed, so the maximum index is
  the `block_dim_[xyz]` maximum minus one. Changing `..=` to `..` fixes
  it.

- For `block_idx_[xyz]`: likewise, but relative to `grid_dim_[xyz]`.

- For `block_dim_[xyz]`: these were all one too big. Not sure why,
  perhaps a `..`/`..=` mix-up?

- For `grid_dim_[xyz]`: likewise. (Yes, these grid maximum dimensions
  are all of the form 2^N-1 even though the block maximum dimensions are
  all of the form 2^N. I don't know why, but it's what the CUDA docs
  say.)
---
 crates/cuda_std/src/thread.rs | 36 +++++++++++++++++++++++------------
 1 file changed, 24 insertions(+), 12 deletions(-)

diff --git a/crates/cuda_std/src/thread.rs b/crates/cuda_std/src/thread.rs
index e2426001..85f9df26 100644
--- a/crates/cuda_std/src/thread.rs
+++ b/crates/cuda_std/src/thread.rs
@@ -108,73 +108,85 @@ macro_rules! in_range {
 #[gpu_only]
 #[inline(always)]
 pub fn thread_idx_x() -> u32 {
-    in_range!(__nvvm_thread_idx_x, 0..=1024)
+    // The range is derived from the `block_idx_x` range.
+    in_range!(__nvvm_thread_idx_x, 0..1024)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn thread_idx_y() -> u32 {
-    in_range!(__nvvm_thread_idx_y, 0..=1024)
+    // The range is derived from the `block_idx_y` range.
+    in_range!(__nvvm_thread_idx_y, 0..1024)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn thread_idx_z() -> u32 {
-    in_range!(__nvvm_thread_idx_z, 0..=64)
+    // The range is derived from the `block_idx_z` range.
+    in_range!(__nvvm_thread_idx_z, 0..64)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn block_idx_x() -> u32 {
-    in_range!(__nvvm_block_idx_x, 0..=2147483647)
+    // The range is derived from the `grid_idx_x` range.
+    in_range!(__nvvm_block_idx_x, 0..2147483647)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn block_idx_y() -> u32 {
-    in_range!(__nvvm_block_idx_y, 0..=65535)
+    // The range is derived from the `grid_idx_y` range.
+    in_range!(__nvvm_block_idx_y, 0..65535)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn block_idx_z() -> u32 {
-    in_range!(__nvvm_block_idx_z, 0..=65535)
+    // The range is derived from the `grid_idx_z` range.
+    in_range!(__nvvm_block_idx_z, 0..65535)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn block_dim_x() -> u32 {
-    in_range!(__nvvm_block_dim_x, 1..=1025)
+    // CUDA Compute Capabilities: "Maximum x- or y-dimensionality of a block" is 1024.
+    in_range!(__nvvm_block_dim_x, 1..=1024)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn block_dim_y() -> u32 {
-    in_range!(__nvvm_block_dim_y, 1..=1025)
+    // CUDA Compute Capabilities: "Maximum x- or y-dimensionality of a block" is 1024.
+    in_range!(__nvvm_block_dim_y, 1..=1024)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn block_dim_z() -> u32 {
-    in_range!(__nvvm_block_dim_z, 1..=65)
+    // CUDA Compute Capabilities: "Maximum z-dimension of a block" is 64.
+    in_range!(__nvvm_block_dim_z, 1..=64)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn grid_dim_x() -> u32 {
-    in_range!(__nvvm_grid_dim_x, 1..=2147483648)
+    // CUDA Compute Capabilities: "Maximum x-dimension of a grid of thread blocks" is 2^32 - 1.
+    in_range!(__nvvm_grid_dim_x, 1..=2147483647)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn grid_dim_y() -> u32 {
-    in_range!(__nvvm_grid_dim_y, 1..=65536)
+    // CUDA Compute Capabilities: "Maximum y- or z-dimension of a grid of thread blocks" is 65535.
+    in_range!(__nvvm_grid_dim_y, 1..=65535)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn grid_dim_z() -> u32 {
-    in_range!(__nvvm_grid_dim_z, 1..=65536)
+    // CUDA Compute Capabilities: "Maximum y- or z-dimension of a grid of thread blocks" is 65535.
+    in_range!(__nvvm_grid_dim_z, 1..=65535)
 }
 
 /// Gets the 3d index of the thread currently executing the kernel.

From 9b1eff72c8926a5e7a08540c9243a2effdc83495 Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <nnethercote@vectorware.com>
Date: Fri, 7 Nov 2025 15:50:54 +1100
Subject: [PATCH 3/4] Don't call intrinsics in 3d `dim`/`idx` functions.

Instead call the Rust functions that have the range constraints. That
way the 3d versions get the same range constraints as the 1d versions. It
also avoids the need for some `unsafe` blocks.
---
 crates/cuda_std/src/thread.rs | 32 ++++----------------------------
 1 file changed, 4 insertions(+), 28 deletions(-)

diff --git a/crates/cuda_std/src/thread.rs b/crates/cuda_std/src/thread.rs
index 85f9df26..44bcc5c5 100644
--- a/crates/cuda_std/src/thread.rs
+++ b/crates/cuda_std/src/thread.rs
@@ -193,26 +193,14 @@ pub fn grid_dim_z() -> u32 {
 #[gpu_only]
 #[inline(always)]
 pub fn thread_idx() -> UVec3 {
-    unsafe {
-        UVec3::new(
-            __nvvm_thread_idx_x(),
-            __nvvm_thread_idx_y(),
-            __nvvm_thread_idx_z(),
-        )
-    }
+    UVec3::new(thread_idx_x(), thread_idx_y(), thread_idx_z())
 }
 
 /// Gets the 3d index of the block that the thread currently executing the kernel is located in.
 #[gpu_only]
 #[inline(always)]
 pub fn block_idx() -> UVec3 {
-    unsafe {
-        UVec3::new(
-            __nvvm_block_idx_x(),
-            __nvvm_block_idx_y(),
-            __nvvm_block_idx_z(),
-        )
-    }
+    UVec3::new(block_idx_x(), block_idx_y(), block_idx_z())
 }
 
 /// Gets the 3d layout of the thread blocks executing this kernel. In other words,
@@ -220,13 +208,7 @@ pub fn block_idx() -> UVec3 {
 #[gpu_only]
 #[inline(always)]
 pub fn block_dim() -> UVec3 {
-    unsafe {
-        UVec3::new(
-            __nvvm_block_dim_x(),
-            __nvvm_block_dim_y(),
-            __nvvm_block_dim_z(),
-        )
-    }
+    UVec3::new(block_dim_x(), block_dim_y(), block_dim_z())
 }
 
 /// Gets the 3d layout of the block grids executing this kernel. In other words,
@@ -234,13 +216,7 @@ pub fn block_dim() -> UVec3 {
 #[gpu_only]
 #[inline(always)]
 pub fn grid_dim() -> UVec3 {
-    unsafe {
-        UVec3::new(
-            __nvvm_grid_dim_x(),
-            __nvvm_grid_dim_y(),
-            __nvvm_grid_dim_z(),
-        )
-    }
+    UVec3::new(grid_dim_x(), grid_dim_y(), grid_dim_z())
 }
 
 /// Gets the overall thread index, accounting for 1d/2d/3d block/grid dimensions. This

From bc1ad2c0e93bbbb90c7d5a2b255695cc9b8514de Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <nnethercote@vectorware.com>
Date: Fri, 7 Nov 2025 16:04:52 +1100
Subject: [PATCH 4/4] Remove `__nvvm_{thread,block,grid}_{idx,dim}_[xyz]`
 intrinsics.

`core` has equivalents, might as well use them instead.
---
 crates/cuda_std/src/lib.rs                 |   7 +-
 crates/cuda_std/src/thread.rs              |  44 ++++------
 crates/rustc_codegen_nvvm/libintrinsics.bc | Bin 8388 -> 7768 bytes
 crates/rustc_codegen_nvvm/libintrinsics.ll |  92 ---------------------
 4 files changed, 20 insertions(+), 123 deletions(-)

diff --git a/crates/cuda_std/src/lib.rs b/crates/cuda_std/src/lib.rs
index 752c07f1..8aef4d74 100644
--- a/crates/cuda_std/src/lib.rs
+++ b/crates/cuda_std/src/lib.rs
@@ -24,7 +24,12 @@
 #![allow(internal_features)]
 #![cfg_attr(
     target_os = "cuda",
-    feature(alloc_error_handler, asm_experimental_arch, link_llvm_intrinsics)
+    feature(
+        alloc_error_handler,
+        asm_experimental_arch,
+        link_llvm_intrinsics,
+        stdarch_nvptx
+    )
 )]
 
 extern crate alloc;
diff --git a/crates/cuda_std/src/thread.rs b/crates/cuda_std/src/thread.rs
index 44bcc5c5..42edbecc 100644
--- a/crates/cuda_std/src/thread.rs
+++ b/crates/cuda_std/src/thread.rs
@@ -63,22 +63,6 @@ use glam::{UVec2, UVec3};
 // different calling conventions dont exist in nvptx, so we just use C as a placeholder.
 extern "C" {
     // defined in libintrinsics.ll
-    fn __nvvm_thread_idx_x() -> u32;
-    fn __nvvm_thread_idx_y() -> u32;
-    fn __nvvm_thread_idx_z() -> u32;
-
-    fn __nvvm_block_dim_x() -> u32;
-    fn __nvvm_block_dim_y() -> u32;
-    fn __nvvm_block_dim_z() -> u32;
-
-    fn __nvvm_block_idx_x() -> u32;
-    fn __nvvm_block_idx_y() -> u32;
-    fn __nvvm_block_idx_z() -> u32;
-
-    fn __nvvm_grid_dim_x() -> u32;
-    fn __nvvm_grid_dim_y() -> u32;
-    fn __nvvm_grid_dim_z() -> u32;
-
     fn __nvvm_warp_size() -> u32;
 
     fn __nvvm_block_barrier();
@@ -92,8 +76,8 @@ extern "C" {
 macro_rules! in_range {
     // The bounds were taken mostly from the cuda C++ programming guide. I also
     // double-checked with what cuda clang does by checking its emitted llvm ir's scalar metadata.
-    ($func_name:ident, $range:expr) => {{
-        let val = unsafe { $func_name() };
+    ($func_name:path, $range:expr) => {{
+        let val = unsafe { $func_name() as u32 };
         if !$range.contains(&val) {
             // SAFETY: this condition is declared unreachable by compute capability max bound.
             // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
@@ -109,84 +93,84 @@ macro_rules! in_range {
 #[inline(always)]
 pub fn thread_idx_x() -> u32 {
     // The range is derived from the `block_idx_x` range.
-    in_range!(__nvvm_thread_idx_x, 0..1024)
+    in_range!(core::arch::nvptx::_thread_idx_x, 0..1024)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn thread_idx_y() -> u32 {
     // The range is derived from the `block_idx_y` range.
-    in_range!(__nvvm_thread_idx_y, 0..1024)
+    in_range!(core::arch::nvptx::_thread_idx_y, 0..1024)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn thread_idx_z() -> u32 {
     // The range is derived from the `block_idx_z` range.
-    in_range!(__nvvm_thread_idx_z, 0..64)
+    in_range!(core::arch::nvptx::_thread_idx_z, 0..64)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn block_idx_x() -> u32 {
     // The range is derived from the `grid_idx_x` range.
-    in_range!(__nvvm_block_idx_x, 0..2147483647)
+    in_range!(core::arch::nvptx::_block_idx_x, 0..2147483647)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn block_idx_y() -> u32 {
     // The range is derived from the `grid_idx_y` range.
-    in_range!(__nvvm_block_idx_y, 0..65535)
+    in_range!(core::arch::nvptx::_block_idx_y, 0..65535)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn block_idx_z() -> u32 {
     // The range is derived from the `grid_idx_z` range.
-    in_range!(__nvvm_block_idx_z, 0..65535)
+    in_range!(core::arch::nvptx::_block_idx_z, 0..65535)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn block_dim_x() -> u32 {
     // CUDA Compute Capabilities: "Maximum x- or y-dimensionality of a block" is 1024.
-    in_range!(__nvvm_block_dim_x, 1..=1024)
+    in_range!(core::arch::nvptx::_block_dim_x, 1..=1024)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn block_dim_y() -> u32 {
     // CUDA Compute Capabilities: "Maximum x- or y-dimensionality of a block" is 1024.
-    in_range!(__nvvm_block_dim_y, 1..=1024)
+    in_range!(core::arch::nvptx::_block_dim_y, 1..=1024)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn block_dim_z() -> u32 {
     // CUDA Compute Capabilities: "Maximum z-dimension of a block" is 64.
-    in_range!(__nvvm_block_dim_z, 1..=64)
+    in_range!(core::arch::nvptx::_block_dim_z, 1..=64)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn grid_dim_x() -> u32 {
     // CUDA Compute Capabilities: "Maximum x-dimension of a grid of thread blocks" is 2^32 - 1.
-    in_range!(__nvvm_grid_dim_x, 1..=2147483647)
+    in_range!(core::arch::nvptx::_grid_dim_x, 1..=2147483647)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn grid_dim_y() -> u32 {
     // CUDA Compute Capabilities: "Maximum y- or z-dimension of a grid of thread blocks" is 65535.
-    in_range!(__nvvm_grid_dim_y, 1..=65535)
+    in_range!(core::arch::nvptx::_grid_dim_y, 1..=65535)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn grid_dim_z() -> u32 {
     // CUDA Compute Capabilities: "Maximum y- or z-dimension of a grid of thread blocks" is 65535.
-    in_range!(__nvvm_grid_dim_z, 1..=65535)
+    in_range!(core::arch::nvptx::_grid_dim_z, 1..=65535)
 }
 
 /// Gets the 3d index of the thread currently executing the kernel.
diff --git a/crates/rustc_codegen_nvvm/libintrinsics.bc b/crates/rustc_codegen_nvvm/libintrinsics.bc
index c22e92db132d0ac424c4695c5f309a0d7c924b5e..cbce93e225e3c7456f466fded314a7d26a479cda 100644
GIT binary patch
literal 7768
zcmb`L4^Wfm9mk)SyyOK5;pNY*p$3`&bF`;v083HC3j|$g-40#1;o2?<1WeX|F$v+1
zw!Zw4M5S%8wqvV1v7@)zT{~Lst?qig@TZkp?4jFPd-a@5bk(ySv)$HhZMWa^=KVoJ
z1+zQfGw=6#zR&0Ry}$SQ7fFRNbpE~!LgEP_wF-S+)#kS^U;l9Jib|%CT12ZLq<|$P
zDNak`pq>sNt|Pave4MS4^M$MSG2M!9$0uZFvwLd=)w;NfeMuE#d|^gOg6hE~g|z`w
zFhgTpA9#^u7FMuT%3x~lSatcvj^3%ROMBRY_417>Lg4XaY43&q!o?ZLRZExlOIju-
z@<xa1Ybfm`-O2L5QBO7`7mjx1ht0v{p0LR?r{&XM&~_$Bmb@f`cvTRWzCe1Xr9!hI
zp(l|k(B>~=-|fCY$ZicGInY*^6G+t9cCqQXgWYhu*nKFNk+z|?H>rDU1Akyx&8YU1
z0Si08%2GY-xVqfJUd)%(s`gK$8rIi5Rj7Zz;-s3%SL7x<ojUMqMJn6(kUXI?aF2{-
zY?+LHPX`0f*q_Oi?0Jor#-3$l<i?31hIG`21pio}Gi01LKm$o%)-uGk4sbh$WUKA6
zmLb1a<M$@`i&g%1jo)>aPrEcJpBdnM2Cm)0dBs9wP`Pb@_Gz7zUlF)=@gFrUTszPC
z1~@1{)uZ9s^_+Wvg922eB3PnZE%RDaqC#0xqC#0xqC=%@)6it*?vwKCVj8p;%{g@>
zc(Ri<6|FPnYCBg}#tjl8&{wCIpxu`TQw57Gd@1_I4NC|~z`82uqjFdxJl6G<t9RuS
zqQSZ)(4DlA5Chgj{M}gtQSmpe^)kG;d_~5X_xvYunz;w|m^`QA4Ra4v^jSuegt-T*
za*p}$AdE)r3^#pcZR))w&HzR6vIGW~nF%Y16}UkHKYSa)(OIXLpiTS3j;I6%pJBS9
zfYTod<%AUg7OxP0#T(L)ID=S&n<d~~YvBo_fwFkPAZYi1FI53JYD{C;*=L6M!%ybK
z5n(QeeRpkSd2vsOeT;dVe+fe@#{fO1ODj_i;$9RjMc<4AHVJH_@TJOmC1V|7s<E!G
z{P*EqQ3(wGD3M`j9~j~tuO46{XL*yoF_|YE1~BHGcc+oLS?Muded9g@QDeKJ?}3S`
zsAX{S>>0wiuw7s2F8U5bG+4I;#<n~xV?+4HFotL5C2(wB0zpr70=nNvCvaN1FDijc
zS$Pc1qjc#jA1E`$MJBL+dyr)f*bd9!D@|h52z*_h29t<@aWF$M4^YbIMM##-dxW2v
zm%}?`UXrz>$}qRM6@8}*?66(v;VoW0rhlk38Ift^?Nq3jMCO_Hpgb`?@<k3W>*|Sm
z)4b=Him0L3^nU12)L2c`2dksfs3@A3#;ck0()d`RPKrvSO};xijWXHss5FiqEU%49
zBj;4;wWu`Svj=OU((tY9FN|9DO?N(`<_Z1I5HI84reu8z*68Nl^gFp)MHoGL`pOd@
zlJwaQVE32`mbz?&4Iv_o$va+9v4p%v_m3AoMG1m>Y)ajb!ITAzCEcUF6$01R%6a;?
zDK3+gPkH6%g0e_akv%e;^}!)^gFarwx|UAJ%Epaly|S{2)?TlR+n&I+nfxvn*Y1+}
zTKz7KKXW2gc`c}%(x)`(Q!dUZXR4J&{8-j`t@^AsYs8w}px<5kR#)kyan(`Tnzy>v
zbjwO7W-@#hZo8ek-|lzAUGTetewT&w<#WC-;vsB{;5R4ux2gQ@et%nk$MWkl%Ig=D
ze-A0I>1DO!ht$)D)Dz<UoK<~7s~+pfI$@Qc2^v?88&~y<_cupN_8UtAveNyd_rgQJ
z>@5A0taNBpaopi|)%e}6F0Y+ywfMa%zjwf&`N0oTu9K7-)ynCh@}ePSN~636Z=uQ0
zIyam>Ql5QUt3KC}eeQyKdMfKGlii@&STbZ>eW0s!qB>7{5sSa2m22(axZ(3)3LSy$
zpJku0sz1?YogdD6g2|={YqMso@`rYHl}wD54jIc}K)xJVGi@vlKy|eDoh9(5{uan5
zfosp_-2EMoT;!G4Ldr|k%Fpz<!>7x$r>E{XJC${w$)>Y>7$!CeG--)DAI6JouW|Jo
z%iy(#jH@O`J1?u>J(T=OIi1eS+N^LM<P;no^t|OJ)(m!)9)Q7-l?`;Qfq@E+(nG<j
z2`}Arb~wN=f=0#e43?LKzqq(+Q#$`WfZ(!f=5tTP$UPn-w<AVwdyHIrj9go69N!{L
z8Nx@}qT|9h$H;kN<hI>{8wt-FzjZ*RWW^(Qetu~%f9Y)ba{4Vo=e&D@ko%XrM{{!J
za?)v;9Pi+Jm$4yxCx5^aa*n2wBgw<OjfZGAf8Sfh3*F{;Z{Fs3L$^8JOSd`R!2cO9
zQJml5KnLBwraBFq*)5~<Ouv5TuZ3vVx+n|(RfBbdaSw`1;$|_UxnIS|Js%_Yi`cm8
zpU22O8zc8jj9g!g+|#jfHBZLK?TwM!y&#t=&TquIaFUQ*c_JJMp%v@P6(jZv{t?TV
zeS}ODXE6GYc}2OL9)UMqSM+R&I%C9G^mrlicQtUl49BnUU8V>*jdRD#&DPSwaSZaY
z7~^cy5CZ>5p_0$(CKp-N<oAWRf3hWfjlk}Wuc_WF$PdPw`hFsO>qvay)WgD^$K%IS
z9~H7cj!(GpQz55IZgM+>WxtgNZagO3^GEskUm6A76?sC@R-y1KxyjQi6!&p~BW=R!
z7rF7z+J&{pxP+BEgo=}#$@{pleu@iJ?i3#Ap}T^RdiX8l&#)QOWcz36mpI**ufSiP
z^ml?vsa_3AFRB~h<C6Yfc^x47+s8Yg$)KlcC+VUbUi3Vs7ghR$1j!4RX&pc4LzC#C
zqF)R?T@R?}e^>HR(XWwwRP-I-(>X*%zYV-B_@GihVL2hsNj@t2M<gE={YmiYIHIC|
zUGh=USHOT%9u@t2z`qN8RP?JP9~FJO<fEeR2VVnmP|^R5<fEd0T=G%T|EuJqqR+u5
zp}$q3QlHN6Qt;{Aq9VTnd^(S)$gh>+pd!Bse7eR^k*}A0RODMF9~Jq%QhZe84}wqs
z&x(rt%iz=RB2?s$fUg7}75QQCmx7Oq{5gq7MgFS9qaq(K3&%%Ap8jn`<D(+4lXz6*
z*GN1n@;{b%ROIU=9u;{(;!%-*O5#zGKMX$IU#Q3*lYCT>hjsFv<f9`0k>sNye?{_9
zk^h(EqavTkhVwy1K2!2hk-uB=QITII`KZWONj@s_n<XC=c?bCPtVBiL2R=QKQK=uu
zBjgG2>3Ty&z88GDK2ec>7JM2X75Nvyr}0sde_4u;iu{{Wd{pFzrTD1Ge;~z2MgEKw
z9~Jp&DLyLlpGrI`^8b-|RODIs&Y|OjihLUQbbL^eUkW}wPf(H9fqw`1sMM$D|2pvL
z?-i)XKMFqmSq>HXHi<_?zE9#&k%tiEQ3WBbaK6HK@auGso6Su=U!%F!xwY;wv%}VE
zZvBSS_6?_9a<;e~HnYvqINNE~z&^7Ec4@J?U1pD?z252cHR|ZIb?$nrP3Q8q>OAiH
zEjq8mrfXfSv2C%&_Jtao7L6;mE*MyBT`+Q;*Sc8B;pPQnhnp7+-gJxTQu~78rS=8m
z)9IkI5jDlNR=3+x@17e<$<RkmdqaIwUBo0d);9vBgLEaxR_}Av)thgQZE;GJr_JN7
zZ(M+aMhWw}tu8SgT97<PiP>tiZ53?~lyof)uU)s*SMP3cZf(&y3W_DLIRb2641|cD
z=2|HtlopFP3pOtXLd3>qrxX!Ni$$CTn->Ee1%=_)F`M1Z9<Mo~E^g%JdF>H(v0db8
z@{#?+^|}5L?PC9kcCkNAHL`!WKG#2@UF;vxZgcp&jv_Oh8et80!df#HM7wdTO=MBS
zEY_N_5N4a5VOCPnw0i7ecea4GTZF+57ew2Mwb_Dbhfgefb3=nOV!eCp4Nje>t*K7u
zC@Pc|nn*TUy>)i8)#)@B70&G-(ba)_i$29G7w%KM@_t=`PT%B%v$A;QiYA}K=CH1)
XYqnXPj#@{P*X?NXIO;s~lqckWHkoW=

literal 8388
zcmb`L3ve678OK-FS*OIZEIS^MiR?<Y8$>5DiXDuD<04y5s+2I}1cos*L6TpQsO%_`
z{0xtiZ6d|EAO|xw;7nyOj594A7gCyGC?h`zY3eX(nnEaPCMJ&>CZypx4205ePy5+A
z6B~6VJF)(||NZTE-`(!*U9Q3sJ$*|iA<2Z08kH%3<GpWOxb*H%S5?XisYMJbLV7tu
zQj!cL3EIoxj}PSal^^CdDl~;__sY6eKTcL=<#3PH2-@{Y6?;=ECN+hbB}(lbOABj4
z)^MiYvLW;W$ttYiHmbwvd6QM;n*+VmUFUna)f*I>wS>Uo$@1Pa-$CKc<Rg2R;R>x2
zEBRwU`(LO%Lb_8GpHNRest^tYj4^XKb$86v$c`({eMS3aVX|~Y4)HcZTzUe_I_(wu
zP0HO%WUCFv72KijGlcBX6Os#kg}I?6`npHCWqHHh@N>3%D4dzGskb+!dvcRza7-uD
z_LGR6i*WLEA9q++Zs*P#<u%&=sdV#(>c<OBf2}yGlNnWc$|uqz&#Th8zPlC5&d^Wf
zoUATOX6gyZ;28Z`vXY)Z(AwlvGCBG5$f%42>Z3w%vd|s1oHIiQS+=5ely8gh501)j
zbIMysgMNL`uM8GzgB|*y=cFd%{4vGl2;XAnJM6q)tTcwzJ0o;V+cCw50^cG2qo$qj
z(C{r09xBlE>G=*5?~U+Kfu<yaUD|7(*PA8@WlxiYvZqNyrR~(yWafU4DK3d=&`~t!
z^zrb~PR?4i-kN9VTvM6!A|V1jby@_&@ij12u*hOX)i-5dN{ABM+T8O2SRxv1n=0>1
zzKIY$w(X%4DO(6JV>_z(Fnd@Me~Z-$Ge9RnESBM?!?2d=au&;OYkj&jfs47)1omx^
zCNQ3AlqArdvyE_q7z}=*n*EipLH>9^!LWD0FVoB&&@z1cCrSFbGv00e^9RZ1xiePv
zefegJFn7jq1*WbiGCjl=i>Y$_*_%m{S!}?MJ>)O75l(=rSfZN0JRH@NB(vCpAA6|I
zV%HFv9_nH-Rlb~OQc4c-UWS&F<Zy8h+$3-t#fmm}e}5&BnXzrE{9u0wB4Zn7;hszx
zXU3C`YHm8PB8dpt77Q7UcXAr>rWD6mhCh1-=2(FNc3Zz*pKca61JP3TZFL(Y37oI4
zATm4hrpk<>8)Za~ZF`8@{tG!5#REn)u73=2@yl$>?;BG!gvSAv{0?7+WEtfC=@v85
zVZW-c<L!--b@<Z1-XSs%_M0jT%cKcZ?vf_(;kWY=_-<YTI+cD&e1;h}D8d}f0xMC?
zk~FzBDgGS0dt~`CSWB$HN_dm(l_WV-OE#M4)|RU8@@gSpa+=00a|SF1jA6+ioS(*h
z^V7I_ej3lrPvdB0NRq}=+l4Ag8viWvz)cf((|wbbpD9TrBlU)*@mFI@uR@s|e>HCU
znLH@D8vYl{D<oH=^Y8Q0_`)?Wji2A*St&{5l^gP}lcaHfsxBoyjlHia_DIrj-7kMj
zl1Aps<u#Htw!CLQB3Zg$xIH_~1fHd_qUxI|j;_NN=x*!pmV4}ki=qcp|B!ss>;SN}
zxdBhK52jv@OvZAq=hbw0veXdb2eU{i=q9Ii4VkjCkfo%1qPIfe+uL~Gz)sZ#lJ>Di
z{kfnnQdQ)Pk7d6-q-!uGi&)q4DS6ppOIfeHY^tr-@8NeT`F3m2<Ka6z@|L!sM<2|Z
zN>^VDtEWwAO{TQ7m(`c6)J2-f?9&F_NkjIyBd5W%r}Xu%(qopj2juHs?^@R_FP*xa
z*<$B+Ir&?iK`;CYK~Fg7vGXlPzU8ZA2-_+IZOY(IZO}UqY##`$ymVQ8>5Te|sQRKw
zUUPUzcWy{GCH|gr=#Ch4lY#6b4#hiR%i6=1wFBbs!HJT7OG!vx+COnK9P~<O=?Qu1
z=!EJmSI|=(^m@AdPQJ|^^lO9uNHFW|yV5R^v`?$l=fdi<=Co<O`XZc#-k5!AEN8qt
z=eR+4Dv)#PjPBfY_D8au2JPmOQOj+EU8Pf1`RqjO!PYjuZD8}J&%<f-3gkSHbHt&W
zF=U?}%YImvLlZV+Uv?<&ezdD(YNB-1QU({~%J{l-meLS3Cwkvp3TGN@g?yBJhmrRV
z1h$^ls4qs<=d0A8nexVtm*<?DzW(HN_GwuTo#Xwm`KExTEcF`Udhs3Ao&if4oc5??
z?Xii@3%Wx?sWattIxiTqV|kFj!PY>xRDNpRa98OdTpW2>q-z~qsPF{6;~o0gUAl}L
z3&~`HUdz1@E-#5aoA@g0GR>U;L2>BkbHfR8&nL(|mmoKkAQwrHdn7^b;RLyd6679C
zklU3Y*O4ICo*1WT6{gLxjd9U+plME!^Cigb`~hw}woWxWB3d;kK0oH?2M=+wj4NMB
zpNe#icTW*=>q_rLZk|FxI_<{}2Q<AaxTv#JGiZ;xC(_9)sbiWt4Me;8`TkJ6=rxXa
z@EXS(y~goIu5rA`{~2$IIKN||0Nt;wwR$eATTbVhzDvws3sTmxC<||&ux>E!Z^R{W
zwU|<FUxM7T335*-##Q|~LGGypxhE6k`V!=xNQ|rgRf61O337WD<lxmvx-T3hBu}vf
zwuI=a4dtqFXN6{~ebPBjriwFVruQ|fas}N2uez@2-Xhs!#8`BDA&NtK*j~nxH}tMh
zMctOU?d57~Y2i3##bknUcIpXP0r?cFHM!m73a7PK3rXSRlE}RRH=JC3<~~7jFu6^?
zRZvbP57@T}OV1=9jNC7%K2IK*wh6kBV#eSS^n(gx%>zRIsKQy{7K|qqBjZiNN`arL
zXctT)ys>DPu;vZk+4Z1Ma-1KTct}`xfuE5-BACDCjg=vx@^PiJdyim!PB}8!BW!wA
zIn&=ORJ~7c6|vOAtE(I?K!&_uZv=<luR3@!q2FXGS-T397OGolg^e}<N53<z0Zj!h
z2gM?CtU(!CsM3!=BtKTCZTv%j1@lnRUkg6n?@`gegZZfFZ(}|x`a779ihh9ksOUe*
zd{p#b0H3Z~RP<kEJ}UZeF&`EEli<^}hKl|Sc-ioWN`2)@m=~BJ>Z78c%Y0PyZwH^g
zq@tp~jrpkP?_xeG`cH%ZBZz~F{z2xWqW>=QQPKYrd_BZLMgKZjgVaYwznJ-`=-<P9
zRP<ZGr{5`2sZZBi5BPNbpdvpAKAlHY<X>WOP>~-4pRQR{<WDdk75RTN9~JqpS$tIF
z)$-W?B~X#i0-ruJQIXFDUkyGg@+R<?gO7^*9gIgs{$9qTBJXEBD)OC-M@4=g<57|S
zE#py<Kg4)c<WDdj75V=#9u@g-8IOv*7G56d{e_CWk@=`159_3a`KZWSnU9KmHS<xC
zcQPLp`8MXGA|GNtD)Ix&M@4>w`KZVrWIihLhnbIx{7LZX-i?a<dGP5rk4pVeJ|Q22
zPuCkN^51|@*C#6S96U&9d{pG=hg=#T75OaiX?#@VZv>ylM@8Pm;-ey8#^R$Qzn;ZM
zMgA@p9~Jqnj7LTO0mh>u?_)eF@&U%9BHsf(-A_=F-v|En;G<HX?*9kCr{8)}k)Hye
zei21QeunX=$kUGx;uln=$U_LS4c-a*VV{O~f+5P-Y)vgKjW)m2Tkoi|x$4?%ZQr-r
zzi)LgtH!;f_LsIgSL5usIR}T$Inq(_M~gAkk&dDcj@s^Z)kzLDYv7n!1BbLaydIm+
z)lu(uw=@>eQ!St;THx`w75KdM+Y9`zx`MXFI@=fP>{zI?Y0<c1?}CBF-UTDq`W=g<
z9P3^%cC35B;7vaeo%Jsmp7k#npH2sz4awrEad^G1dhgt&WQK`R;-ogzH`T^XVq<+H
zPz8`KgX-#AT($MKt7F^UjPkYn{Pm3sP|(RRzt`ar)1ejSxk_w~y1E^r?SooDtIO{!
z*wIq&ZE){sEpV+aW?*w1*t{4B5q-@yEF#nvi#Q85F9t%y#%4E*2(`r`&VtR0fv(ks
zvD2~Hyv;tpEv_x@8_o0j<Jw}s$kXKG$H&@p<Kz0p@p1j)c$#Yb_*i>xd|bacKCZvc
z)#7&**<ja*X>1U-nz16<jXUZ@7B$9Vs~IaXw%HwHnTn>v=Zv|t6|}t~40fy{+HP#k
zRzy3tV>z1}8r*T~-REp@7x>znY71ONg>0dTWTV4h>$ExCZd*~|+#Mvk1>pX`nBp}H
vk11YrYr*OQQ&S7<mBnjTHMO|vT#i+>&2<j9tH#yj_qv*Vu38^`s6hE2se$k-

diff --git a/crates/rustc_codegen_nvvm/libintrinsics.ll b/crates/rustc_codegen_nvvm/libintrinsics.ll
index d9cb5e2d..88594cb7 100644
--- a/crates/rustc_codegen_nvvm/libintrinsics.ll
+++ b/crates/rustc_codegen_nvvm/libintrinsics.ll
@@ -8,86 +8,6 @@ source_filename = "libintrinsics"
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 
-; thread ----
-
-define i32 @__nvvm_thread_idx_x() #0 {
-start:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  ret i32 %0
-}
-
-define i32 @__nvvm_thread_idx_y() #0 {
-start:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
-  ret i32 %0
-}
-
-define i32 @__nvvm_thread_idx_z() #0 {
-start:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
-  ret i32 %0
-}
-
-; block dimension ----
-
-define i32 @__nvvm_block_dim_x() #0 {
-start:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-  ret i32 %0
-}
-
-define i32 @__nvvm_block_dim_y() #0 {
-start:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
-  ret i32 %0
-}
-
-define i32 @__nvvm_block_dim_z() #0 {
-start:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
-  ret i32 %0
-}
-
-; block idx ----
-
-define i32 @__nvvm_block_idx_x() #0 {
-start:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
-  ret i32 %0
-}
-
-define i32 @__nvvm_block_idx_y() #0 {
-start:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
-  ret i32 %0
-}
-
-define i32 @__nvvm_block_idx_z() #0 {
-start:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
-  ret i32 %0
-}
-
-; grid dimension ---- 
-
-define i32 @__nvvm_grid_dim_x() #0 {
-start:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
-  ret i32 %0
-}
-
-define i32 @__nvvm_grid_dim_y() #0 {
-start:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
-  ret i32 %0
-}
-
-define i32 @__nvvm_grid_dim_z() #0 {
-start:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
-  ret i32 %0
-}
-
 ; warp ----
 
 define i32 @__nvvm_warp_size() #0 {
@@ -96,18 +16,6 @@ start:
   ret i32 %0
 }
 
-declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-declare i32 @llvm.nvvm.read.ptx.sreg.tid.y()
-declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
-declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
-declare i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
-declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
-declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
-declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
-declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
-declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
-declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
 declare i32 @llvm.nvvm.read.ptx.sreg.warpsize()
 
 ; other ----