From a4db20b970d334f43fac86d1c388d07ab90ba110 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Wed, 5 Nov 2025 02:14:29 +0000
Subject: [PATCH 01/79] add ltoir test support

---
 cuda_core/tests/test_program.py | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index adc778973c..5c966296fe 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -387,3 +387,33 @@ def test_nvvm_program_options(init_cuda, nvvm_ir, options):
     assert ".visible .entry simple(" in ptx_text
 
     program.close()
+
+@nvvm_available
+@pytest.mark.parametrize(
+    "options",
+    [
+       ProgramOptions(name="ltoir_test1", arch="sm_90", device_code_optimize=False),
+       ProgramOptions(name="ltoir_test2", arch="sm_100", link_time_optimization=True),
+       ProgramOptions(
+            name="ltoir_test3",
+            arch="sm_90",
+            ftz=True,
+            prec_sqrt=False,
+            prec_div=False,
+            fma=True,
+            device_code_optimize=True,
+            link_time_optimization=True,
+       ),
+    ],
+)
+def test_nvvm_program_options_ltoir(init_cuda, nvvm_ir, options):
+    """Test NVVM programs for LTOIR with different options"""
+    program = Program(nvvm_ir, "nvvm", options)
+    assert program.backend == "NVVM"
+
+    ltoir_code = program.compile("ltoir")
+    assert isinstance(ltoir_code, ObjectCode)
+    assert ltoir_code.name == options.name
+    code_content = ltoir_code.code
+    assert len(ltoir_code.code) > 0
+    program.close()

From fb6cfb3902b8a22aba92624ea417616033018852 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Tue, 25 Nov 2025 04:10:17 +0000
Subject: [PATCH 02/79] add options for multi-modules

---
 cuda_core/cuda/core/experimental/_program.py | 30 ++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index 1db453fed1..2e0115be48 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -298,6 +298,7 @@ class ProgramOptions:
     split_compile: int | None = None
     fdevice_syntax_only: bool | None = None
     minimal: bool | None = None
+    extra_sources: Union[str, bytes, list[Union[str, bytes]], tuple[Union[str, bytes]]] | None = None
 
     def __post_init__(self):
         self._name = self.name.encode()
@@ -479,6 +480,9 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
             assert_type(code, str)
             # TODO: support pre-loaded headers & include names
             # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved
+            
+            if options.extra_sources is not None:
+                raise ValueError("extra_sources is not supported by the NVRTC backend (C++ code_type)")
 
             self._mnff.handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), options._name, 0, [], []))
             self._mnff.backend = "NVRTC"
@@ -487,6 +491,9 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
 
         elif code_type == "ptx":
             assert_type(code, str)
+            if options.extra_sources is not None:
+                raise ValueError("extra_sources is not supported by the PTX backend.")
+            
             self._linker = Linker(
                 ObjectCode._init(code.encode(), code_type), options=self._translate_program_options(options)
             )
@@ -502,6 +509,29 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
             self._mnff.handle = nvvm.create_program()
             self._mnff.backend = "NVVM"
             nvvm.add_module_to_program(self._mnff.handle, code, len(code), options._name.decode())
+            if options.extra_sources is not None:
+                if isinstance(options.extra_sources, (str, bytes)):
+                    extra_sources = [options.extra_sources]
+                elif isinstance(options.extra_sources, (list, tuple)):
+                    extra_sources = options.extra_sources
+                else:
+                    raise TypeError("extra_sources must be str, bytes, list, or tuple")
+                
+                if len(extra_sources) == 0:
+                    raise ValueError("extra_sources cannot be empty if provided")
+                
+                for i, extra_source in enumerate(extra_sources):
+                    if isinstance(extra_source, str):
+                        extra_source = extra_source.encode("utf-8")
+                    elif not isinstance(extra_source, (bytes, bytearray)):
+                        raise TypeError(f"Extra source {i} must be provided as str, bytes, or bytearray")
+                    
+                    if len(extra_source) == 0:
+                        raise ValueError(f"Extra source {i} cannot be empty")
+                    
+                    extra_name = f"{options.name}_extra_{i}"
+                    nvvm.add_module_to_program(self._mnff.handle, extra_source, len(extra_source), extra_name)
+            
             self._backend = "NVVM"
             self._linker = None
 

From 7aaed4e4f36b890b73098430ae39384217e3d4d1 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Tue, 25 Nov 2025 11:05:56 +0000
Subject: [PATCH 03/79] add tests

---
 cuda_core/tests/test_program.py | 119 ++++++++++++++++++++++++++++++++
 1 file changed, 119 insertions(+)

diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index 5c966296fe..70b131556d 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -417,3 +417,122 @@ def test_nvvm_program_options_ltoir(init_cuda, nvvm_ir, options):
     code_content = ltoir_code.code
     assert len(ltoir_code.code) > 0
     program.close()
+
+
+@nvvm_available
+def test_nvvm_program_with_single_extra_source(nvvm_ir):
+    """Test NVVM program with a single extra source"""
+    from cuda.core.experimental._program import _get_nvvm_module
+    nvvm = _get_nvvm_module()
+    major, minor, debug_major, debug_minor = nvvm.ir_version()
+    #helper nvvm ir for multiple module loading
+    helper_nvvm_ir = f"""target triple = "nvptx64-unknown-cuda"
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+define i32 @helper_add(i32 %x) {{
+entry:
+  %result = add i32 %x, 1
+  ret i32 %result
+}}
+
+!nvvmir.version = !{{!0}}
+!0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
+"""
+
+    options = ProgramOptions(name="multi_module_test", extra_sources=helper_nvvm_ir)
+    program = Program(nvvm_ir, "nvvm", options)
+    
+    assert program.backend == "NVVM"
+    
+    ptx_code = program.compile("ptx")
+    assert isinstance(ptx_code, ObjectCode)
+    assert ptx_code.name == "multi_module_test"
+    
+    program.close()
+
+@nvvm_available
+def test_nvvm_program_with_multiple_extra_sources():
+    """Test NVVM program with multiple extra sources"""
+    from cuda.core.experimental._program import _get_nvvm_module
+    nvvm = _get_nvvm_module()
+    major, minor, debug_major, debug_minor = nvvm.ir_version()
+    
+    main_nvvm_ir = f"""target triple = "nvptx64-unknown-cuda"
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+declare i32 @helper_add(i32) nounwind readnone
+declare i32 @helper_mul(i32) nounwind readnone
+
+define void @main_kernel(i32* %data) {{
+entry:
+  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  %ptr = getelementptr inbounds i32, i32* %data, i32 %tid
+  %val = load i32, i32* %ptr, align 4
+  
+  %val1 = call i32 @helper_add(i32 %val)
+  %val2 = call i32 @helper_mul(i32 %val1)
+  
+  store i32 %val2, i32* %ptr, align 4
+  ret void
+}}
+
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() nounwind readnone
+
+!nvvm.annotations = !{{!0}}
+!0 = !{{void (i32*)* @main_kernel, !"kernel", i32 1}}
+
+!nvvmir.version = !{{!1}}
+!1 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
+"""
+
+    helper1_ir = f"""target triple = "nvptx64-unknown-cuda"
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+define i32 @helper_add(i32 %x) nounwind readnone {{
+entry:
+  %result = add i32 %x, 1
+  ret i32 %result
+}}
+
+!nvvmir.version = !{{!0}}
+!0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
+"""
+
+    helper2_ir = f"""target triple = "nvptx64-unknown-cuda"
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+define i32 @helper_mul(i32 %x) nounwind readnone {{
+entry:
+  %result = mul i32 %x, 2
+  ret i32 %result
+}}
+
+!nvvmir.version = !{{!0}}
+!0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
+"""
+
+    options = ProgramOptions(
+        name="nvvm_multi_helper_test", 
+        extra_sources=[helper1_ir, helper2_ir]
+    )
+    program = Program(main_nvvm_ir, "nvvm", options)
+    
+    assert program.backend == "NVVM"
+    ptx_code = program.compile("ptx")
+    assert isinstance(ptx_code, ObjectCode)
+    assert ptx_code.name == "nvvm_multi_helper_test"
+    
+    ltoir_code = program.compile("ltoir")
+    assert isinstance(ltoir_code, ObjectCode)
+    assert ltoir_code.name == "nvvm_multi_helper_test"
+    
+    program.close()
+       
+def test_cpp_program_with_extra_sources():
+    #negative test with NVRTC with multiple sources
+    code = 'extern "C" __global__ void my_kernel(){}'
+    helper = 'extern "C" __global__ void helper(){}'
+    options = ProgramOptions(extra_sources = helper)
+    with pytest.raises(ValueError, match="extra_sources is not supported by the NVRTC backend"):
+        Program(code, "c++", options)
+    
\ No newline at end of file

From 64c7f7d3d7acc4785bd475530786777e5babe2ab Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Tue, 25 Nov 2025 14:48:43 +0000
Subject: [PATCH 04/79] add bitcode test

---
 cuda_core/tests/test_program.py | 45 +++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index 70b131556d..24ae767220 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -527,6 +527,51 @@ def test_nvvm_program_with_multiple_extra_sources():
     assert ltoir_code.name == "nvvm_multi_helper_test"
     
     program.close()
+
+@nvvm_available
+def test_bitcode_format():
+    import os
+    from pathlib import Path
+    bitcode_path = os.environ.get("BITCODE_NVVM_PATH")
+    if not bitcode_path:
+        pytest.skip(f"BITCODE_NVVM_PATH environment variable is not set."
+                    "Disabling the test.")
+    bitcode_file = Path(bitcode_path)
+    if not bitcode_file.exists():
+        pytest.skip(f"Bitcode file not found: {bitcode_path}")
+    
+    if not bitcode_file.suffix == '.bc':
+        pytest.skip(f"Expected .bc file, got: {bitcode_file.suffix}")
+    
+    try:
+        with open(bitcode_file, 'rb') as f:
+            bitcode_data = f.read()
+        
+        if len(bitcode_data) < 4:
+            pytest.skip("Bitcode file appears to be empty or invalid")
+        
+        options = ProgramOptions(
+            name=f"existing_bitcode_{bitcode_file.stem}",
+            arch="sm_90"
+        )
+        program = Program(bitcode_data, "nvvm", options)
+        
+        assert program.backend == "NVVM"
+        ptx_result = program.compile("ptx")
+        assert isinstance(ptx_result, ObjectCode)
+        assert ptx_result.name.startswith("existing_bitcode_")
+        assert len(ptx_result.code) > 0
+        try:
+            ltoir_result = program.compile("ltoir")
+            assert isinstance(ltoir_result, ObjectCode)
+            assert len(ltoir_result.code) > 0
+            print(f"LTOIR size: {len(ltoir_result.code)} bytes")
+        except Exception as e:
+            print(f"LTOIR compilation failed : {e}")
+        program.close()
+    except Exception as e:
+        pytest.fail(f"Failed to compile existing bitcode file {bitcode_path}: {str(e)}")
+
        
 def test_cpp_program_with_extra_sources():
     #negative test with NVRTC with multiple sources

From 42ba301078bc17c796cddbe99499c957633eb5d8 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 25 Nov 2025 14:51:09 +0000
Subject: [PATCH 05/79] [pre-commit.ci] auto code formatting

---
 cuda_core/cuda/core/experimental/_program.py | 14 ++--
 cuda_core/tests/test_program.py              | 68 ++++++++++----------
 2 files changed, 40 insertions(+), 42 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index 2e0115be48..eadd37ca95 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -480,7 +480,7 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
             assert_type(code, str)
             # TODO: support pre-loaded headers & include names
             # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved
-            
+
             if options.extra_sources is not None:
                 raise ValueError("extra_sources is not supported by the NVRTC backend (C++ code_type)")
 
@@ -493,7 +493,7 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
             assert_type(code, str)
             if options.extra_sources is not None:
                 raise ValueError("extra_sources is not supported by the PTX backend.")
-            
+
             self._linker = Linker(
                 ObjectCode._init(code.encode(), code_type), options=self._translate_program_options(options)
             )
@@ -516,22 +516,22 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
                     extra_sources = options.extra_sources
                 else:
                     raise TypeError("extra_sources must be str, bytes, list, or tuple")
-                
+
                 if len(extra_sources) == 0:
                     raise ValueError("extra_sources cannot be empty if provided")
-                
+
                 for i, extra_source in enumerate(extra_sources):
                     if isinstance(extra_source, str):
                         extra_source = extra_source.encode("utf-8")
                     elif not isinstance(extra_source, (bytes, bytearray)):
                         raise TypeError(f"Extra source {i} must be provided as str, bytes, or bytearray")
-                    
+
                     if len(extra_source) == 0:
                         raise ValueError(f"Extra source {i} cannot be empty")
-                    
+
                     extra_name = f"{options.name}_extra_{i}"
                     nvvm.add_module_to_program(self._mnff.handle, extra_source, len(extra_source), extra_name)
-            
+
             self._backend = "NVVM"
             self._linker = None
 
diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index 24ae767220..9aca1878d4 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -388,13 +388,14 @@ def test_nvvm_program_options(init_cuda, nvvm_ir, options):
 
     program.close()
 
+
 @nvvm_available
 @pytest.mark.parametrize(
     "options",
     [
-       ProgramOptions(name="ltoir_test1", arch="sm_90", device_code_optimize=False),
-       ProgramOptions(name="ltoir_test2", arch="sm_100", link_time_optimization=True),
-       ProgramOptions(
+        ProgramOptions(name="ltoir_test1", arch="sm_90", device_code_optimize=False),
+        ProgramOptions(name="ltoir_test2", arch="sm_100", link_time_optimization=True),
+        ProgramOptions(
             name="ltoir_test3",
             arch="sm_90",
             ftz=True,
@@ -403,7 +404,7 @@ def test_nvvm_program_options(init_cuda, nvvm_ir, options):
             fma=True,
             device_code_optimize=True,
             link_time_optimization=True,
-       ),
+        ),
     ],
 )
 def test_nvvm_program_options_ltoir(init_cuda, nvvm_ir, options):
@@ -423,9 +424,10 @@ def test_nvvm_program_options_ltoir(init_cuda, nvvm_ir, options):
 def test_nvvm_program_with_single_extra_source(nvvm_ir):
     """Test NVVM program with a single extra source"""
     from cuda.core.experimental._program import _get_nvvm_module
+
     nvvm = _get_nvvm_module()
     major, minor, debug_major, debug_minor = nvvm.ir_version()
-    #helper nvvm ir for multiple module loading
+    # helper nvvm ir for multiple module loading
     helper_nvvm_ir = f"""target triple = "nvptx64-unknown-cuda"
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 
@@ -441,22 +443,24 @@ def test_nvvm_program_with_single_extra_source(nvvm_ir):
 
     options = ProgramOptions(name="multi_module_test", extra_sources=helper_nvvm_ir)
     program = Program(nvvm_ir, "nvvm", options)
-    
+
     assert program.backend == "NVVM"
-    
+
     ptx_code = program.compile("ptx")
     assert isinstance(ptx_code, ObjectCode)
     assert ptx_code.name == "multi_module_test"
-    
+
     program.close()
 
+
 @nvvm_available
 def test_nvvm_program_with_multiple_extra_sources():
     """Test NVVM program with multiple extra sources"""
     from cuda.core.experimental._program import _get_nvvm_module
+
     nvvm = _get_nvvm_module()
     major, minor, debug_major, debug_minor = nvvm.ir_version()
-    
+
     main_nvvm_ir = f"""target triple = "nvptx64-unknown-cuda"
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 
@@ -468,10 +472,10 @@ def test_nvvm_program_with_multiple_extra_sources():
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
   %ptr = getelementptr inbounds i32, i32* %data, i32 %tid
   %val = load i32, i32* %ptr, align 4
-  
+
   %val1 = call i32 @helper_add(i32 %val)
   %val2 = call i32 @helper_mul(i32 %val1)
-  
+
   store i32 %val2, i32* %ptr, align 4
   ret void
 }}
@@ -511,51 +515,46 @@ def test_nvvm_program_with_multiple_extra_sources():
 !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
 """
 
-    options = ProgramOptions(
-        name="nvvm_multi_helper_test", 
-        extra_sources=[helper1_ir, helper2_ir]
-    )
+    options = ProgramOptions(name="nvvm_multi_helper_test", extra_sources=[helper1_ir, helper2_ir])
     program = Program(main_nvvm_ir, "nvvm", options)
-    
+
     assert program.backend == "NVVM"
     ptx_code = program.compile("ptx")
     assert isinstance(ptx_code, ObjectCode)
     assert ptx_code.name == "nvvm_multi_helper_test"
-    
+
     ltoir_code = program.compile("ltoir")
     assert isinstance(ltoir_code, ObjectCode)
     assert ltoir_code.name == "nvvm_multi_helper_test"
-    
+
     program.close()
 
+
 @nvvm_available
 def test_bitcode_format():
     import os
     from pathlib import Path
+
     bitcode_path = os.environ.get("BITCODE_NVVM_PATH")
     if not bitcode_path:
-        pytest.skip(f"BITCODE_NVVM_PATH environment variable is not set."
-                    "Disabling the test.")
+        pytest.skip("BITCODE_NVVM_PATH environment variable is not set.Disabling the test.")
     bitcode_file = Path(bitcode_path)
     if not bitcode_file.exists():
         pytest.skip(f"Bitcode file not found: {bitcode_path}")
-    
-    if not bitcode_file.suffix == '.bc':
+
+    if not bitcode_file.suffix == ".bc":
         pytest.skip(f"Expected .bc file, got: {bitcode_file.suffix}")
-    
+
     try:
-        with open(bitcode_file, 'rb') as f:
+        with open(bitcode_file, "rb") as f:
             bitcode_data = f.read()
-        
+
         if len(bitcode_data) < 4:
             pytest.skip("Bitcode file appears to be empty or invalid")
-        
-        options = ProgramOptions(
-            name=f"existing_bitcode_{bitcode_file.stem}",
-            arch="sm_90"
-        )
+
+        options = ProgramOptions(name=f"existing_bitcode_{bitcode_file.stem}", arch="sm_90")
         program = Program(bitcode_data, "nvvm", options)
-        
+
         assert program.backend == "NVVM"
         ptx_result = program.compile("ptx")
         assert isinstance(ptx_result, ObjectCode)
@@ -572,12 +571,11 @@ def test_bitcode_format():
     except Exception as e:
         pytest.fail(f"Failed to compile existing bitcode file {bitcode_path}: {str(e)}")
 
-       
+
 def test_cpp_program_with_extra_sources():
-    #negative test with NVRTC with multiple sources
+    # negative test with NVRTC with multiple sources
     code = 'extern "C" __global__ void my_kernel(){}'
     helper = 'extern "C" __global__ void helper(){}'
-    options = ProgramOptions(extra_sources = helper)
+    options = ProgramOptions(extra_sources=helper)
     with pytest.raises(ValueError, match="extra_sources is not supported by the NVRTC backend"):
         Program(code, "c++", options)
-    
\ No newline at end of file

From 7ca68992514e63cffbd51642a59a9f277e79540f Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Tue, 25 Nov 2025 18:14:22 +0000
Subject: [PATCH 06/79] fix format

---
 cuda_core/tests/test_program.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index 9aca1878d4..7d3e1c8709 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -416,7 +416,7 @@ def test_nvvm_program_options_ltoir(init_cuda, nvvm_ir, options):
     assert isinstance(ltoir_code, ObjectCode)
     assert ltoir_code.name == options.name
     code_content = ltoir_code.code
-    assert len(ltoir_code.code) > 0
+    assert len(code_content) > 0
     program.close()
 
 
@@ -439,7 +439,7 @@ def test_nvvm_program_with_single_extra_source(nvvm_ir):
 
 !nvvmir.version = !{{!0}}
 !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
-"""
+""" # noqa: E501
 
     options = ProgramOptions(name="multi_module_test", extra_sources=helper_nvvm_ir)
     program = Program(nvvm_ir, "nvvm", options)
@@ -487,7 +487,7 @@ def test_nvvm_program_with_multiple_extra_sources():
 
 !nvvmir.version = !{{!1}}
 !1 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
-"""
+""" # noqa: E501
 
     helper1_ir = f"""target triple = "nvptx64-unknown-cuda"
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
@@ -500,7 +500,7 @@ def test_nvvm_program_with_multiple_extra_sources():
 
 !nvvmir.version = !{{!0}}
 !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
-"""
+""" # noqa: E501
 
     helper2_ir = f"""target triple = "nvptx64-unknown-cuda"
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
@@ -513,7 +513,7 @@ def test_nvvm_program_with_multiple_extra_sources():
 
 !nvvmir.version = !{{!0}}
 !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
-"""
+""" # noqa: E501
 
     options = ProgramOptions(name="nvvm_multi_helper_test", extra_sources=[helper1_ir, helper2_ir])
     program = Program(main_nvvm_ir, "nvvm", options)
@@ -542,7 +542,7 @@ def test_bitcode_format():
     if not bitcode_file.exists():
         pytest.skip(f"Bitcode file not found: {bitcode_path}")
 
-    if not bitcode_file.suffix == ".bc":
+    if bitcode_file.suffix != ".bc":
         pytest.skip(f"Expected .bc file, got: {bitcode_file.suffix}")
 
     try:

From 0674ea17dda35f886279bdef2f33a4aa56071562 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 25 Nov 2025 18:15:33 +0000
Subject: [PATCH 07/79] [pre-commit.ci] auto code formatting

---
 cuda_core/tests/test_program.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index 7d3e1c8709..1b96a5f094 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -439,7 +439,7 @@ def test_nvvm_program_with_single_extra_source(nvvm_ir):
 
 !nvvmir.version = !{{!0}}
 !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
-""" # noqa: E501
+"""  # noqa: E501
 
     options = ProgramOptions(name="multi_module_test", extra_sources=helper_nvvm_ir)
     program = Program(nvvm_ir, "nvvm", options)
@@ -487,7 +487,7 @@ def test_nvvm_program_with_multiple_extra_sources():
 
 !nvvmir.version = !{{!1}}
 !1 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
-""" # noqa: E501
+"""  # noqa: E501
 
     helper1_ir = f"""target triple = "nvptx64-unknown-cuda"
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
@@ -500,7 +500,7 @@ def test_nvvm_program_with_multiple_extra_sources():
 
 !nvvmir.version = !{{!0}}
 !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
-""" # noqa: E501
+"""  # noqa: E501
 
     helper2_ir = f"""target triple = "nvptx64-unknown-cuda"
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
@@ -513,7 +513,7 @@ def test_nvvm_program_with_multiple_extra_sources():
 
 !nvvmir.version = !{{!0}}
 !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
-""" # noqa: E501
+"""  # noqa: E501
 
     options = ProgramOptions(name="nvvm_multi_helper_test", extra_sources=[helper1_ir, helper2_ir])
     program = Program(main_nvvm_ir, "nvvm", options)

From 03b122466a0eb10b4adb1b692b4a36b017f7241e Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Wed, 26 Nov 2025 10:26:42 +0000
Subject: [PATCH 08/79] refresh

---
 cuda_core/cuda/core/experimental/_program.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index eadd37ca95..7d3ea15e46 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -479,11 +479,11 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
         if code_type == "c++":
             assert_type(code, str)
             # TODO: support pre-loaded headers & include names
-            # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved
-
+            
             if options.extra_sources is not None:
                 raise ValueError("extra_sources is not supported by the NVRTC backend (C++ code_type)")
 
+            # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved
             self._mnff.handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), options._name, 0, [], []))
             self._mnff.backend = "NVRTC"
             self._backend = "NVRTC"

From 033f11cacf560de6ccf1bf9605070cc912081ddf Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Mon, 1 Dec 2025 11:45:49 +0000
Subject: [PATCH 09/79] apply bitcode file from cupy_test helpers

---
 cuda_core/tests/test_program.py               |  61 ++++----
 .../cuda_python_test_helpers/nvvm_bitcode.py  | 136 ++++++++++++++++++
 2 files changed, 163 insertions(+), 34 deletions(-)
 create mode 100644 cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py

diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index 1b96a5f094..c62e1712f6 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -14,6 +14,13 @@
 cuda_driver_version = handle_return(driver.cuDriverGetVersion())
 is_culink_backend = _linker._decide_nvjitlink_or_driver()
 
+try:
+    from cuda_python_test_helpers.nvvm_bitcode import (
+        minimal_nvvmir
+    )
+    _test_helpers_available = True
+except ImportError:
+    _test_helpers_available = False
 
 def _is_nvvm_available():
     """Check if NVVM is available."""
@@ -531,46 +538,32 @@ def test_nvvm_program_with_multiple_extra_sources():
 
 
 @nvvm_available
-def test_bitcode_format():
+@pytest.mark.skipif(not _test_helpers_available, reason = "cuda_python_test_helpers not accessible")
+def test_bitcode_format(minimal_nvvmir):
     import os
     from pathlib import Path
+    
+    if len(minimal_nvvmir) < 4:
+        pytest.skip("Bitcode file is not valid or empty")
 
-    bitcode_path = os.environ.get("BITCODE_NVVM_PATH")
-    if not bitcode_path:
-        pytest.skip("BITCODE_NVVM_PATH environment variable is not set.Disabling the test.")
-    bitcode_file = Path(bitcode_path)
-    if not bitcode_file.exists():
-        pytest.skip(f"Bitcode file not found: {bitcode_path}")
-
-    if bitcode_file.suffix != ".bc":
-        pytest.skip(f"Expected .bc file, got: {bitcode_file.suffix}")
+    options = ProgramOptions(name=f"minimal_nvvmir_bitcode_test", arch="sm_90")
+    program = Program(minimal_nvvmir, "nvvm", options)
 
+    assert program.backend == "NVVM"
+    ptx_result = program.compile("ptx")
+    assert isinstance(ptx_result, ObjectCode)
+    assert ptx_result.name == "minimal_nvvmir_bitcode_test"
+    assert len(ptx_result.code) > 0
+    prgram_lto = Program(minimal_nvvmir, "nvvm", options)
     try:
-        with open(bitcode_file, "rb") as f:
-            bitcode_data = f.read()
-
-        if len(bitcode_data) < 4:
-            pytest.skip("Bitcode file appears to be empty or invalid")
-
-        options = ProgramOptions(name=f"existing_bitcode_{bitcode_file.stem}", arch="sm_90")
-        program = Program(bitcode_data, "nvvm", options)
-
-        assert program.backend == "NVVM"
-        ptx_result = program.compile("ptx")
-        assert isinstance(ptx_result, ObjectCode)
-        assert ptx_result.name.startswith("existing_bitcode_")
-        assert len(ptx_result.code) > 0
-        try:
-            ltoir_result = program.compile("ltoir")
-            assert isinstance(ltoir_result, ObjectCode)
-            assert len(ltoir_result.code) > 0
-            print(f"LTOIR size: {len(ltoir_result.code)} bytes")
-        except Exception as e:
-            print(f"LTOIR compilation failed : {e}")
-        program.close()
+        ltoir_result = program_lto.compile("ltoir")
+        assert isinstance(ltoir_result, ObjectCode)
+        assert len(ltoir_result.code) > 0
+        print(f"LTOIR size: {len(ltoir_result.code)} bytes")
     except Exception as e:
-        pytest.fail(f"Failed to compile existing bitcode file {bitcode_path}: {str(e)}")
-
+        print(f"LTOIR compilation failed : {e}")
+    finally:
+        program.close()
 
 def test_cpp_program_with_extra_sources():
     # negative test with NVRTC with multiple sources
diff --git a/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py b/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py
new file mode 100644
index 0000000000..cba22d5425
--- /dev/null
+++ b/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py
@@ -0,0 +1,136 @@
+import binascii
+import re
+from contextlib import contextmanager
+
+import pytest
+from cuda.bindings import nvvm
+
+MINIMAL_NVVMIR_TXT_TEMPLATE = b"""\
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+target triple = "nvptx64-nvidia-cuda"
+
+define void @kernel() {
+entry:
+  ret void
+}
+
+!nvvm.annotations = !{!0}
+!0 = !{void ()* @kernel, !"kernel", i32 1}
+
+!nvvmir.version = !{!1}
+!1 = !{i32 %d, i32 0, i32 %d, i32 0}
+"""  # noqa: E501
+
+MINIMAL_NVVMIR_BITCODE_STATIC = {
+    (1, 3):  # (major, debug_major)
+    "4243c0de3514000005000000620c30244a59be669dfbb4bf0b51804c01000000210c00007f010000"
+    "0b02210002000000160000000781239141c80449061032399201840c250508191e048b62800c4502"
+    "42920b42641032143808184b0a3232884870c421234412878c1041920264c808b1142043468820c9"
+    "01323284182a282a90317cb05c9120c3c8000000892000000b0000003222c80820624600212b2498"
+    "0c212524980c19270c85a4906032645c20246382a01801300128030173046000132677b00778a007"
+    "7cb0033a680377b0877420877408873618877a208770d8e012e5d006f0a0077640077a600774a007"
+    "7640076d900e71a00778a00778d006e980077a80077a80076d900e7160077a100776a0077160076d"
+    "900e7320077a300772a0077320076d900e7640077a600774a0077640076d900e71200778a0077120"
+    "0778a00771200778d006e6300772a0077320077a300772d006e6600774a0077640077a600774d006"
+    "f6100776a0077160077a100776d006f6300772a0077320077a300772d006f6600774a0077640077a"
+    "600774d006f610077280077a10077280077a10077280076de00e7160077a300772a0077640071a21"
+    "4c0e11de9c2e4fbbcfbe211560040000000000000000000000000620b141a0e86000004016080000"
+    "06000000321e980c19114c908c092647c6044362098c009401000000b1180000ac0000003308801c"
+    "c4e11c6614013d88433884c38c4280077978077398710ce6000fed100ef4800e330c421ec2c11dce"
+    "a11c6630053d88433884831bcc033dc8433d8c033dcc788c7470077b08077948877070077a700376"
+    "788770208719cc110eec900ee1300f6e300fe3f00ef0500e3310c41dde211cd8211dc2611e663089"
+    "3bbc833bd04339b4033cbc833c84033bccf0147660077b6807376887726807378087709087706007"
+    "76280776f8057678877780875f08877118877298877998812ceef00eeee00ef5c00eec300362c8a1"
+    "1ce4a11ccca11ce4a11cdc611cca211cc4811dca6106d6904339c84339984339c84339b8c3389443"
+    "3888033b94c32fbc833cfc823bd4033bb0c30cc7698770588772708374680778608774188774a087"
+    "19ce530fee000ff2500ee4900ee3400fe1200eec500e3320281ddcc11ec2411ed2211cdc811edce0"
+    "1ce4e11dea011e66185138b0433a9c833bcc50247660077b68073760877778077898514cf4900ff0"
+    "500e331e6a1eca611ce8211ddec11d7e011ee4a11ccc211df0610654858338ccc33bb0433dd04339"
+    "fcc23ce4433b88c33bb0c38cc50a877998877718877408077a28077298815ce3100eecc00ee5500e"
+    "f33023c1d2411ee4e117d8e11dde011e6648193bb0833db4831b84c3388c4339ccc33cb8c139c8c3"
+    "3bd4033ccc48b471080776600771088771588719dbc60eec600fede006f0200fe5300fe5200ff650"
+    "0e6e100ee3300ee5300ff3e006e9e00ee4500ef83023e2ec611cc2811dd8e117ec211de6211dc421"
+    "1dd8211de8211f66209d3bbc433db80339948339cc58bc7070077778077a08077a488777708719cb"
+    "e70eef300fe1e00ee9400fe9a00fe530c3010373a8077718875f988770708774a08774d087729881"
+    "844139e0c338b0433d904339cc40c4a01dcaa11de0411edec11c662463300ee1c00eec300fe9400f"
+    "e5000000792000001d000000721e482043880c19097232482023818c9191d144a01028643c313242"
+    "8e9021a318100a00060000006b65726e656c0000230802308240042308843082400c330c4230cc40"
+    "0c4441c84860821272b3b36b730973737ba30ba34b7b739b1b2528d271b3b36b4b9373b12b939b4b"
+    "7b731b2530000000a9180000250000000b0a7228877780077a587098433db8c338b04339d0c382e6"
+    "1cc6a10de8411ec2c11de6211de8211ddec11d1634e3600ee7500fe1200fe4400fe1200fe7500ef4"
+    "b08081077928877060077678877108077a28077258709cc338b4013ba4833d94c3026b1cd8211cdc"
+    "e11cdc201ce4611cdc201ce8811ec2611cd0a11cc8611cc2811dd861c1010ff4200fe1500ff4800e"
+    "00000000d11000000600000007cc3ca4833b9c033b94033da0833c94433890c30100000061200000"
+    "06000000130481860301000002000000075010cd14610000000000007120000003000000320e1022"
+    "8400fb020000000000000000650c00001f000000120394f000000000030000000600000006000000"
+    "4c000000010000005800000000000000580000000100000070000000000000000c00000013000000"
+    "1f000000080000000600000000000000700000000000000000000000010000000000000000000000"
+    "060000000000000006000000ffffffff00240000000000005d0c00000d0000001203946700000000"
+    "6b65726e656c31352e302e376e7670747836342d6e76696469612d637564613c737472696e673e00"
+    "00000000",
+    (2, 3):  # (major, debug_major)
+    "4243c0de3514000005000000620c30244a59be669dfbb4bf0b51804c01000000210c000080010000"
+    "0b02210002000000160000000781239141c80449061032399201840c250508191e048b62800c4502"
+    "42920b42641032143808184b0a3232884870c421234412878c1041920264c808b1142043468820c9"
+    "01323284182a282a90317cb05c9120c3c8000000892000000b0000003222c80820624600212b2498"
+    "0c212524980c19270c85a4906032645c20246382a01801300128030173046000132677b00778a007"
+    "7cb0033a680377b0877420877408873618877a208770d8e012e5d006f0a0077640077a600774a007"
+    "7640076d900e71a00778a00778d006e980077a80077a80076d900e7160077a100776a0077160076d"
+    "900e7320077a300772a0077320076d900e7640077a600774a0077640076d900e71200778a0077120"
+    "0778a00771200778d006e6300772a0077320077a300772d006e6600774a0077640077a600774d006"
+    "f6100776a0077160077a100776d006f6300772a0077320077a300772d006f6600774a0077640077a"
+    "600774d006f610077280077a10077280077a10077280076de00e7160077a300772a0077640071a21"
+    "4c0e11de9c2e4fbbcfbe211560040000000000000000000000000620b141a0286100004016080000"
+    "06000000321e980c19114c908c092647c60443620914c10840190000b1180000ac0000003308801c"
+    "c4e11c6614013d88433884c38c4280077978077398710ce6000fed100ef4800e330c421ec2c11dce"
+    "a11c6630053d88433884831bcc033dc8433d8c033dcc788c7470077b08077948877070077a700376"
+    "788770208719cc110eec900ee1300f6e300fe3f00ef0500e3310c41dde211cd8211dc2611e663089"
+    "3bbc833bd04339b4033cbc833c84033bccf0147660077b6807376887726807378087709087706007"
+    "76280776f8057678877780875f08877118877298877998812ceef00eeee00ef5c00eec300362c8a1"
+    "1ce4a11ccca11ce4a11cdc611cca211cc4811dca6106d6904339c84339984339c84339b8c3389443"
+    "3888033b94c32fbc833cfc823bd4033bb0c30cc7698770588772708374680778608774188774a087"
+    "19ce530fee000ff2500ee4900ee3400fe1200eec500e3320281ddcc11ec2411ed2211cdc811edce0"
+    "1ce4e11dea011e66185138b0433a9c833bcc50247660077b68073760877778077898514cf4900ff0"
+    "500e331e6a1eca611ce8211ddec11d7e011ee4a11ccc211df0610654858338ccc33bb0433dd04339"
+    "fcc23ce4433b88c33bb0c38cc50a877998877718877408077a28077298815ce3100eecc00ee5500e"
+    "f33023c1d2411ee4e117d8e11dde011e6648193bb0833db4831b84c3388c4339ccc33cb8c139c8c3"
+    "3bd4033ccc48b471080776600771088771588719dbc60eec600fede006f0200fe5300fe5200ff650"
+    "0e6e100ee3300ee5300ff3e006e9e00ee4500ef83023e2ec611cc2811dd8e117ec211de6211dc421"
+    "1dd8211de8211f66209d3bbc433db80339948339cc58bc7070077778077a08077a488777708719cb"
+    "e70eef300fe1e00ee9400fe9a00fe530c3010373a8077718875f988770708774a08774d087729881"
+    "844139e0c338b0433d904339cc40c4a01dcaa11de0411edec11c662463300ee1c00eec300fe9400f"
+    "e5000000792000001e000000721e482043880c19097232482023818c9191d144a01028643c313242"
+    "8e9021a318100a00060000006b65726e656c0000230802308240042308843082400c23080431c320"
+    "04c30c045118858c04262821373bbb36973037b737ba30bab437b7b95102231d373bbbb6343917bb"
+    "32b9b9b437b7518203000000a9180000250000000b0a7228877780077a587098433db8c338b04339"
+    "d0c382e61cc6a10de8411ec2c11de6211de8211ddec11d1634e3600ee7500fe1200fe4400fe1200f"
+    "e7500ef4b08081077928877060077678877108077a28077258709cc338b4013ba4833d94c3026b1c"
+    "d8211cdce11cdc201ce4611cdc201ce8811ec2611cd0a11cc8611cc2811dd861c1010ff4200fe150"
+    "0ff4800e00000000d11000000600000007cc3ca4833b9c033b94033da0833c94433890c301000000"
+    "6120000006000000130481860301000002000000075010cd14610000000000007120000003000000"
+    "320e10228400fc020000000000000000650c00001f000000120394f0000000000300000006000000"
+    "060000004c000000010000005800000000000000580000000100000070000000000000000c000000"
+    "130000001f0000000800000006000000000000007000000000000000000000000100000000000000"
+    "00000000060000000000000006000000ffffffff00240000000000005d0c00000d00000012039467"
+    "000000006b65726e656c31352e302e376e7670747836342d6e76696469612d637564613c73747269"
+    "6e673e0000000000",
+}
+
+@pytest.fixture(params=("txt", "bitcode_static"))
+def minimal_nvvmir(request):
+    major, minor, debug_major, debug_minor = nvvm.ir_version()
+
+    if request.param == "txt":
+        return MINIMAL_NVVMIR_TXT_TEMPLATE % (major, debug_major)
+
+    bitcode_static_binascii = MINIMAL_NVVMIR_BITCODE_STATIC.get((major, debug_major))
+    if bitcode_static_binascii:
+        return binascii.unhexlify(bitcode_static_binascii)
+    raise RuntimeError(
+        "Static bitcode for NVVM IR version "
+        f"{major}.{debug_major} is not available in this test.\n"
+        "Maintainers: Please run the helper script to generate it and add the "
+        "output to the MINIMAL_NVVMIR_BITCODE_STATIC dict:\n"
+        "  ../../toolshed/build_static_bitcode_input.py"
+    )
\ No newline at end of file

From 6e411ee8619868ec7ac6d4b4304a2e950edf490c Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Mon, 1 Dec 2025 17:44:23 +0000
Subject: [PATCH 10/79] use 2 tuples

---
 cuda_core/cuda/core/experimental/_program.py | 58 ++++++++++++--------
 cuda_core/tests/test_program.py              | 11 +++-
 2 files changed, 43 insertions(+), 26 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index 7d3ea15e46..ba6c4cf0af 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -298,7 +298,10 @@ class ProgramOptions:
     split_compile: int | None = None
     fdevice_syntax_only: bool | None = None
     minimal: bool | None = None
-    extra_sources: Union[str, bytes, list[Union[str, bytes]], tuple[Union[str, bytes]]] | None = None
+    # Creating as 2 tuples ((names, source), (names,source))
+    extra_sources: (
+        Union[List[Tuple[str, Union[str, bytes, bytearray]]], Tuple[Tuple[str, Union[str, bytes, bytearray]]]] | None
+    ) = None
 
     def __post_init__(self):
         self._name = self.name.encode()
@@ -468,13 +471,14 @@ def close(self):
                     nvvm.destroy_program(self.handle)
                 self.handle = None
 
-    __slots__ = ("__weakref__", "_mnff", "_backend", "_linker", "_options")
+    __slots__ = ("__weakref__", "_mnff", "_backend", "_linker", "_options", "_module_count")
 
     def __init__(self, code, code_type, options: ProgramOptions = None):
         self._mnff = Program._MembersNeededForFinalize(self, None, None)
 
         self._options = options = check_or_create_options(ProgramOptions, options, "Program options")
         code_type = code_type.lower()
+        self._module_count = 0
 
         if code_type == "c++":
             assert_type(code, str)
@@ -509,28 +513,36 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
             self._mnff.handle = nvvm.create_program()
             self._mnff.backend = "NVVM"
             nvvm.add_module_to_program(self._mnff.handle, code, len(code), options._name.decode())
+            self._module_count = 1
+             # Add extra modules if provided
             if options.extra_sources is not None:
-                if isinstance(options.extra_sources, (str, bytes)):
-                    extra_sources = [options.extra_sources]
-                elif isinstance(options.extra_sources, (list, tuple)):
-                    extra_sources = options.extra_sources
-                else:
-                    raise TypeError("extra_sources must be str, bytes, list, or tuple")
-
-                if len(extra_sources) == 0:
-                    raise ValueError("extra_sources cannot be empty if provided")
-
-                for i, extra_source in enumerate(extra_sources):
-                    if isinstance(extra_source, str):
-                        extra_source = extra_source.encode("utf-8")
-                    elif not isinstance(extra_source, (bytes, bytearray)):
-                        raise TypeError(f"Extra source {i} must be provided as str, bytes, or bytearray")
-
-                    if len(extra_source) == 0:
-                        raise ValueError(f"Extra source {i} cannot be empty")
-
-                    extra_name = f"{options.name}_extra_{i}"
-                    nvvm.add_module_to_program(self._mnff.handle, extra_source, len(extra_source), extra_name)
+                if not is_sequence(options.extra_sources):
+                    raise TypeError("extra_modules must be a sequence of 2-tuples: ((name1, source1), (name2, source2), ...)")
+                for i, module in enumerate(options.extra_sources):
+                    if not isinstance(module, tuple) or len(module) != 2:
+                        raise TypeError(
+                            f"Each extra module must be a 2-tuple (name, source), got {type(module).__name__} at index {i}"
+                        )
+                    
+                    module_name, module_source = module
+                    
+                    if not isinstance(module_name, str):
+                        raise TypeError(f"Module name at index {i} must be a string, got {type(module_name).__name__}")
+                    
+                    if isinstance(module_source, str):
+                        # Textual LLVM IR - encode to UTF-8 bytes
+                        module_source = module_source.encode("utf-8")
+                    elif not isinstance(module_source, (bytes, bytearray)):
+                        raise TypeError(
+                            f"Module source at index {i} must be str (textual LLVM IR), bytes (textual LLVM IR or bitcode), "
+                            f"or bytearray, got {type(module_source).__name__}"
+                        )
+                    
+                    if len(module_source) == 0:
+                        raise ValueError(f"Module source for '{module_name}' (index {i}) cannot be empty")
+                    
+                    nvvm.add_module_to_program(self._mnff.handle, module_source, len(module_source), module_name)
+                    self._module_count += 1
 
             self._backend = "NVVM"
             self._linker = None
diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index c62e1712f6..aec975e84e 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -435,7 +435,7 @@ def test_nvvm_program_with_single_extra_source(nvvm_ir):
     nvvm = _get_nvvm_module()
     major, minor, debug_major, debug_minor = nvvm.ir_version()
     # helper nvvm ir for multiple module loading
-    helper_nvvm_ir = f"""target triple = "nvptx64-unknown-cuda"
+    helper_nvvmir = f"""target triple = "nvptx64-unknown-cuda"
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 
 define i32 @helper_add(i32 %x) {{
@@ -448,7 +448,12 @@ def test_nvvm_program_with_single_extra_source(nvvm_ir):
 !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
 """  # noqa: E501
 
-    options = ProgramOptions(name="multi_module_test", extra_sources=helper_nvvm_ir)
+    options = ProgramOptions(
+            name="multi_module_test",
+            extra_sources=[
+                ("helper", helper_nvvmir),
+            ]
+        )   
     program = Program(nvvm_ir, "nvvm", options)
 
     assert program.backend == "NVVM"
@@ -522,7 +527,7 @@ def test_nvvm_program_with_multiple_extra_sources():
 !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
 """  # noqa: E501
 
-    options = ProgramOptions(name="nvvm_multi_helper_test", extra_sources=[helper1_ir, helper2_ir])
+    options = ProgramOptions(name="nvvm_multi_helper_test", extra_sources=[("helper1", helper1_ir), ("helper2", helper2_ir),])
     program = Program(main_nvvm_ir, "nvvm", options)
 
     assert program.backend == "NVVM"

From aeb26aa1f8dcdaf8a81f6715faa47c40703ced4e Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Wed, 3 Dec 2025 04:16:32 +0000
Subject: [PATCH 11/79] refresh

---
 cuda_core/cuda/core/experimental/_program.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index 9f74c3b10a..3aab5de3ae 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -303,7 +303,7 @@ class ProgramOptions:
         Union[List[Tuple[str, Union[str, bytes, bytearray]]], Tuple[Tuple[str, Union[str, bytes, bytearray]]]] | None
     ) = None
     numba_debug: bool | None = None  # Custom option for Numba debugging
-
+    
     def __post_init__(self):
         self._name = self.name.encode()
 
@@ -423,8 +423,6 @@ def __post_init__(self):
             self._formatted_options.append("--fdevice-syntax-only")
         if self.minimal is not None and self.minimal:
             self._formatted_options.append("--minimal")
-        if self.numba_debug:
-            self._formatted_options.append("--numba-debug")
 
     def _as_bytes(self):
         # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved

From b3d6d960faa8f9ff130cd558e91b56213d1b448f Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Wed, 3 Dec 2025 07:54:25 +0000
Subject: [PATCH 12/79] format

---
 cuda_core/cuda/core/experimental/_program.py | 27 +++++-----
 cuda_core/tests/test_program.py              | 57 +++++++-------------
 2 files changed, 35 insertions(+), 49 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index 3aab5de3ae..efdb39807a 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -7,7 +7,7 @@
 import weakref
 from contextlib import contextmanager
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Union
+from typing import TYPE_CHECKING, List, Tuple, Union
 from warnings import warn
 
 if TYPE_CHECKING:
@@ -303,7 +303,7 @@ class ProgramOptions:
         Union[List[Tuple[str, Union[str, bytes, bytearray]]], Tuple[Tuple[str, Union[str, bytes, bytearray]]]] | None
     ) = None
     numba_debug: bool | None = None  # Custom option for Numba debugging
-    
+
     def __post_init__(self):
         self._name = self.name.encode()
 
@@ -484,7 +484,7 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
         if code_type == "c++":
             assert_type(code, str)
             # TODO: support pre-loaded headers & include names
-            
+
             if options.extra_sources is not None:
                 raise ValueError("extra_sources is not supported by the NVRTC backend (C++ code_type)")
 
@@ -515,21 +515,24 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
             self._mnff.backend = "NVVM"
             nvvm.add_module_to_program(self._mnff.handle, code, len(code), options._name.decode())
             self._module_count = 1
-             # Add extra modules if provided
+            # Add extra modules if provided
             if options.extra_sources is not None:
                 if not is_sequence(options.extra_sources):
-                    raise TypeError("extra_modules must be a sequence of 2-tuples: ((name1, source1), (name2, source2), ...)")
+                    raise TypeError(
+                        "extra_modules must be a sequence of 2-tuples:((name1, source1), (name2, source2), ...)"
+                    )
                 for i, module in enumerate(options.extra_sources):
                     if not isinstance(module, tuple) or len(module) != 2:
                         raise TypeError(
-                            f"Each extra module must be a 2-tuple (name, source), got {type(module).__name__} at index {i}"
+                            f"Each extra module must be a 2-tuple (name, source)"
+                            f", got {type(module).__name__} at index {i}"
                         )
-                    
+
                     module_name, module_source = module
-                    
+
                     if not isinstance(module_name, str):
-                        raise TypeError(f"Module name at index {i} must be a string, got {type(module_name).__name__}")
-                    
+                        raise TypeError(f"Module name at index {i} must be a string,got {type(module_name).__name__}")
+
                     if isinstance(module_source, str):
                         # Textual LLVM IR - encode to UTF-8 bytes
                         module_source = module_source.encode("utf-8")
@@ -538,10 +541,10 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
                             f"Module source at index {i} must be str (textual LLVM IR), bytes (textual LLVM IR or bitcode), "
                             f"or bytearray, got {type(module_source).__name__}"
                         )
-                    
+
                     if len(module_source) == 0:
                         raise ValueError(f"Module source for '{module_name}' (index {i}) cannot be empty")
-                    
+
                     nvvm.add_module_to_program(self._mnff.handle, module_source, len(module_source), module_name)
                     self._module_count += 1
 
diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index e0b9a61023..f9cb8dc5f2 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -15,13 +15,13 @@
 is_culink_backend = _linker._decide_nvjitlink_or_driver()
 
 try:
-    from cuda_python_test_helpers.nvvm_bitcode import (
-        minimal_nvvmir
-    )
+    from cuda_python_test_helpers.nvvm_bitcode import minimal_nvvmir
+
     _test_helpers_available = True
 except ImportError:
     _test_helpers_available = False
 
+
 def _is_nvvm_available():
     """Check if NVVM is available."""
     try:
@@ -38,29 +38,12 @@ def _is_nvvm_available():
 )
 
 try:
-    from cuda.core.experimental._utils.cuda_utils import driver, handle_return, nvrtc
+    from cuda.core.experimental._utils.cuda_utils import driver, handle_return
 
     _cuda_driver_version = handle_return(driver.cuDriverGetVersion())
 except Exception:
     _cuda_driver_version = 0
 
-
-def _get_nvrtc_version_for_tests():
-    """
-    Get NVRTC version.
-
-    Returns:
-        int: Version in format major * 1000 + minor * 100 (e.g., 13200 for CUDA 13.2)
-        None: If NVRTC is not available
-    """
-    try:
-        nvrtc_major, nvrtc_minor = handle_return(nvrtc.nvrtcVersion())
-        version = nvrtc_major * 1000 + nvrtc_minor * 100
-        return version
-    except Exception:
-        return None
-
-
 _libnvvm_version = None
 _libnvvm_version_attempted = False
 
@@ -200,13 +183,6 @@ def ptx_code_object():
     [
         ProgramOptions(name="abc"),
         ProgramOptions(device_code_optimize=True, debug=True),
-        pytest.param(
-            ProgramOptions(debug=True, numba_debug=True),
-            marks=pytest.mark.skipif(
-                (_get_nvrtc_version_for_tests() or 0) < 13200,
-                reason="numba_debug requires NVRTC >= 13.2",
-            ),
-        ),
         ProgramOptions(relocatable_device_code=True, max_register_count=32),
         ProgramOptions(ftz=True, prec_sqrt=False, prec_div=False),
         ProgramOptions(fma=False, use_fast_math=True),
@@ -473,11 +449,11 @@ def test_nvvm_program_with_single_extra_source(nvvm_ir):
 """  # noqa: E501
 
     options = ProgramOptions(
-            name="multi_module_test",
-            extra_sources=[
-                ("helper", helper_nvvmir),
-            ]
-        )   
+        name="multi_module_test",
+        extra_sources=[
+            ("helper", helper_nvvmir),
+        ],
+    )
     program = Program(nvvm_ir, "nvvm", options)
 
     assert program.backend == "NVVM"
@@ -551,7 +527,13 @@ def test_nvvm_program_with_multiple_extra_sources():
 !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
 """  # noqa: E501
 
-    options = ProgramOptions(name="nvvm_multi_helper_test", extra_sources=[("helper1", helper1_ir), ("helper2", helper2_ir),])
+    options = ProgramOptions(
+        name="nvvm_multi_helper_test",
+        extra_sources=[
+            ("helper1", helper1_ir),
+            ("helper2", helper2_ir),
+        ],
+    )
     program = Program(main_nvvm_ir, "nvvm", options)
 
     assert program.backend == "NVVM"
@@ -567,11 +549,11 @@ def test_nvvm_program_with_multiple_extra_sources():
 
 
 @nvvm_available
-@pytest.mark.skipif(not _test_helpers_available, reason = "cuda_python_test_helpers not accessible")
+@pytest.mark.skipif(not _test_helpers_available, reason="cuda_python_test_helpers not accessible")
 def test_bitcode_format(minimal_nvvmir):
     import os
     from pathlib import Path
-    
+
     if len(minimal_nvvmir) < 4:
         pytest.skip("Bitcode file is not valid or empty")
 
@@ -583,7 +565,7 @@ def test_bitcode_format(minimal_nvvmir):
     assert isinstance(ptx_result, ObjectCode)
     assert ptx_result.name == "minimal_nvvmir_bitcode_test"
     assert len(ptx_result.code) > 0
-    prgram_lto = Program(minimal_nvvmir, "nvvm", options)
+    program_lto = Program(minimal_nvvmir, "nvvm", options)
     try:
         ltoir_result = program_lto.compile("ltoir")
         assert isinstance(ltoir_result, ObjectCode)
@@ -594,6 +576,7 @@ def test_bitcode_format(minimal_nvvmir):
     finally:
         program.close()
 
+
 def test_cpp_program_with_extra_sources():
     # negative test with NVRTC with multiple sources
     code = 'extern "C" __global__ void my_kernel(){}'

From edd6401cc5300c03adb583c021da60b6da18bfe6 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 3 Dec 2025 07:55:54 +0000
Subject: [PATCH 13/79] [pre-commit.ci] auto code formatting

---
 cuda_core/tests/test_program.py                              | 5 +----
 .../cuda_python_test_helpers/nvvm_bitcode.py                 | 5 ++---
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index f9cb8dc5f2..d0de56a41c 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -551,13 +551,10 @@ def test_nvvm_program_with_multiple_extra_sources():
 @nvvm_available
 @pytest.mark.skipif(not _test_helpers_available, reason="cuda_python_test_helpers not accessible")
 def test_bitcode_format(minimal_nvvmir):
-    import os
-    from pathlib import Path
-
     if len(minimal_nvvmir) < 4:
         pytest.skip("Bitcode file is not valid or empty")
 
-    options = ProgramOptions(name=f"minimal_nvvmir_bitcode_test", arch="sm_90")
+    options = ProgramOptions(name="minimal_nvvmir_bitcode_test", arch="sm_90")
     program = Program(minimal_nvvmir, "nvvm", options)
 
     assert program.backend == "NVVM"
diff --git a/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py b/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py
index cba22d5425..e42dbff085 100644
--- a/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py
+++ b/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py
@@ -1,6 +1,4 @@
 import binascii
-import re
-from contextlib import contextmanager
 
 import pytest
 from cuda.bindings import nvvm
@@ -117,6 +115,7 @@
     "6e673e0000000000",
 }
 
+
 @pytest.fixture(params=("txt", "bitcode_static"))
 def minimal_nvvmir(request):
     major, minor, debug_major, debug_minor = nvvm.ir_version()
@@ -133,4 +132,4 @@ def minimal_nvvmir(request):
         "Maintainers: Please run the helper script to generate it and add the "
         "output to the MINIMAL_NVVMIR_BITCODE_STATIC dict:\n"
         "  ../../toolshed/build_static_bitcode_input.py"
-    )
\ No newline at end of file
+    )

From 8dbbafece0fc088fa8fbb7d042347f6ce905f9b5 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Mon, 15 Dec 2025 17:59:25 +0000
Subject: [PATCH 14/79] fix from upstream

---
 cuda_core/cuda/core/experimental/_program.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index efdb39807a..2610bc1232 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -423,6 +423,8 @@ def __post_init__(self):
             self._formatted_options.append("--fdevice-syntax-only")
         if self.minimal is not None and self.minimal:
             self._formatted_options.append("--minimal")
+        if self.numba_debug:
+            self._formatted_options.append("--numba-debug")
 
     def _as_bytes(self):
         # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved

From b78f0c3e820bce5b6f70903d3d83e10821b02fba Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Wed, 17 Dec 2025 13:22:24 +0000
Subject: [PATCH 15/79] refresh from upstream

---
 cuda_core/tests/test_program.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index 91567fdced..02b9899eea 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -184,6 +184,13 @@ def ptx_code_object():
     [
         ProgramOptions(name="abc"),
         ProgramOptions(device_code_optimize=True, debug=True),
+        pytest.param(
+            ProgramOptions(debug=True, numba_debug=True),
+            marks=pytest.mark.skipif(
+                (_get_nvrtc_version_for_tests() or 0) < 13200,
+                reason="numba_debug requires NVRTC >= 13.2",
+            ),
+        ),
         ProgramOptions(relocatable_device_code=True, max_register_count=32),
         ProgramOptions(ftz=True, prec_sqrt=False, prec_div=False),
         ProgramOptions(fma=False, use_fast_math=True),
@@ -714,4 +721,4 @@ def test_program_options_as_bytes_nvvm_unsupported_option():
     """Test that unsupported options raise CUDAError for NVVM backend"""
     options = ProgramOptions(arch="sm_80", lineinfo=True)
     with pytest.raises(CUDAError, match="not supported by NVVM backend"):
-        options.as_bytes("nvvm")
+        options.as_bytes("nvvm")
\ No newline at end of file

From 99a5593d0d82d5de399228eadf4fc6b6ed825ff6 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Wed, 17 Dec 2025 13:27:55 +0000
Subject: [PATCH 16/79] fix tests

---
 cuda_core/tests/test_program.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index 02b9899eea..00d0709870 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -45,6 +45,21 @@ def _is_nvvm_available():
 except Exception:
     _cuda_driver_version = 0
 
+
+def _get_nvrtc_version_for_tests():
+    """
+    Get NVRTC version.
+    Returns:
+        int: Version in format major * 1000 + minor * 100 (e.g., 13200 for CUDA 13.2)
+        None: If NVRTC is not available
+    """
+    try:
+        nvrtc_major, nvrtc_minor = handle_return(nvrtc.nvrtcVersion())
+        version = nvrtc_major * 1000 + nvrtc_minor * 100
+        return version
+    except Exception:
+        return None
+
 _libnvvm_version = None
 _libnvvm_version_attempted = False
 

From 783f6e5a419b74757c7620824a7a23ec847321fd Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Wed, 17 Dec 2025 13:29:35 +0000
Subject: [PATCH 17/79] take path_finder from PR 447

---
 .../bindings/_internal/nvjitlink_linux.pyx    |   1 +
 .../cuda/bindings/_path_finder/cuda_paths.py  | 400 ++++++++++++++++++
 .../find_nvidia_dynamic_library.py            | 139 ++++++
 .../cuda/bindings/_path_finder/findlib.py     |  69 +++
 .../load_nvidia_dynamic_library.py            |  92 ++++
 .../_path_finder/sys_path_find_sub_dirs.py    |  40 ++
 cuda_bindings/cuda/bindings/path_finder.py    |  37 ++
 7 files changed, 778 insertions(+)
 create mode 100644 cuda_bindings/cuda/bindings/_path_finder/cuda_paths.py
 create mode 100644 cuda_bindings/cuda/bindings/_path_finder/find_nvidia_dynamic_library.py
 create mode 100644 cuda_bindings/cuda/bindings/_path_finder/findlib.py
 create mode 100644 cuda_bindings/cuda/bindings/_path_finder/load_nvidia_dynamic_library.py
 create mode 100644 cuda_bindings/cuda/bindings/_path_finder/sys_path_find_sub_dirs.py
 create mode 100644 cuda_bindings/cuda/bindings/path_finder.py

diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
index 8c96b6d640..d095b3f524 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
@@ -12,6 +12,7 @@ from .utils import FunctionNotFoundError, NotSupportedError
 from cuda.pathfinder import load_nvidia_dynamic_lib
 
 
+
 ###############################################################################
 # Extern
 ###############################################################################
diff --git a/cuda_bindings/cuda/bindings/_path_finder/cuda_paths.py b/cuda_bindings/cuda/bindings/_path_finder/cuda_paths.py
new file mode 100644
index 0000000000..c827699b3a
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_path_finder/cuda_paths.py
@@ -0,0 +1,400 @@
+import os
+import platform
+import re
+import site
+import sys
+import traceback
+import warnings
+from collections import namedtuple
+from pathlib import Path
+
+from .findlib import find_file, find_lib
+
+IS_WIN32 = sys.platform.startswith("win32")
+
+_env_path_tuple = namedtuple("_env_path_tuple", ["by", "info"])
+
+
+def _get_numba_CUDA_INCLUDE_PATH():
+    # From numba/numba/core/config.py
+
+    def _readenv(name, ctor, default):
+        value = os.environ.get(name)
+        if value is None:
+            return default() if callable(default) else default
+        try:
+            return ctor(value)
+        except Exception:
+            warnings.warn(  # noqa: B028
+                f"Environment variable '{name}' is defined but "
+                f"its associated value '{value}' could not be "
+                "parsed.\nThe parse failed with exception:\n"
+                f"{traceback.format_exc()}",
+                RuntimeWarning,
+            )
+            return default
+
+    if IS_WIN32:
+        cuda_path = os.environ.get("CUDA_PATH")
+        if cuda_path:  # noqa: SIM108
+            default_cuda_include_path = os.path.join(cuda_path, "include")
+        else:
+            default_cuda_include_path = "cuda_include_not_found"
+    else:
+        default_cuda_include_path = os.path.join(os.sep, "usr", "local", "cuda", "include")
+    CUDA_INCLUDE_PATH = _readenv("NUMBA_CUDA_INCLUDE_PATH", str, default_cuda_include_path)
+    return CUDA_INCLUDE_PATH
+
+
+config_CUDA_INCLUDE_PATH = _get_numba_CUDA_INCLUDE_PATH()
+
+
+def _find_valid_path(options):
+    """Find valid path from *options*, which is a list of 2-tuple of
+    (name, path).  Return first pair where *path* is not None.
+    If no valid path is found, return ('<unknown>', None)
+    """
+    for by, data in options:
+        if data is not None:
+            return by, data
+    else:
+        return "<unknown>", None
+
+
+def _get_libdevice_path_decision():
+    options = [
+        ("Conda environment", get_conda_ctk()),
+        ("Conda environment (NVIDIA package)", get_nvidia_libdevice_ctk()),
+        ("CUDA_HOME", get_cuda_home("nvvm", "libdevice")),
+        ("Debian package", get_debian_pkg_libdevice()),
+        ("NVIDIA NVCC Wheel", get_libdevice_wheel()),
+    ]
+    libdevice_ctk_dir = get_system_ctk("nvvm", "libdevice")
+    if libdevice_ctk_dir and os.path.exists(libdevice_ctk_dir):
+        options.append(("System", libdevice_ctk_dir))
+
+    by, libdir = _find_valid_path(options)
+    return by, libdir
+
+
+def _nvvm_lib_dir():
+    if IS_WIN32:
+        return "nvvm", "bin"
+    else:
+        return "nvvm", "lib64"
+
+
+def _get_nvvm_path_decision():
+    options = [
+        ("Conda environment", get_conda_ctk()),
+        ("Conda environment (NVIDIA package)", get_nvidia_nvvm_ctk()),
+        ("CUDA_HOME", get_cuda_home(*_nvvm_lib_dir())),
+        ("NVIDIA NVCC Wheel", _get_nvvm_wheel()),
+    ]
+    # need to ensure nvvm dir actually exists
+    nvvm_ctk_dir = get_system_ctk(*_nvvm_lib_dir())
+    if nvvm_ctk_dir and os.path.exists(nvvm_ctk_dir):
+        options.append(("System", nvvm_ctk_dir))
+
+    by, path = _find_valid_path(options)
+    return by, path
+
+
+def _get_nvvm_wheel():
+    site_paths = [site.getusersitepackages()] + site.getsitepackages() + ["conda", None]
+    for sp in site_paths:
+        # The SONAME is taken based on public CTK 12.x releases
+        if sys.platform.startswith("linux"):
+            dso_dir = "lib64"
+            # Hack: libnvvm from Linux wheel
+            # does not have any soname (CUDAINST-3183)
+            dso_path = "libnvvm.so"
+        elif sys.platform.startswith("win32"):
+            dso_dir = "bin"
+            dso_path = "nvvm64_40_0.dll"
+        else:
+            raise AssertionError()
+
+        if sp is not None:
+            dso_dir = os.path.join(sp, "nvidia", "cuda_nvcc", "nvvm", dso_dir)
+            dso_path = os.path.join(dso_dir, dso_path)
+            if os.path.exists(dso_path):
+                return str(Path(dso_path).parent)
+
+
+def _get_libdevice_paths():
+    by, libdir = _get_libdevice_path_decision()
+    if by == "NVIDIA NVCC Wheel":
+        # The NVVM path is a directory, not a file
+        out = os.path.join(libdir, "libdevice.10.bc")
+    else:
+        # Search for pattern
+        pat = r"libdevice(\.\d+)*\.bc$"
+        candidates = find_file(re.compile(pat), libdir)
+        # Keep only the max (most recent version) of the bitcode files.
+        out = max(candidates, default=None)
+    return _env_path_tuple(by, out)
+
+
+def _cudalib_path():
+    if IS_WIN32:
+        return "bin"
+    else:
+        return "lib64"
+
+
+def _cuda_home_static_cudalib_path():
+    if IS_WIN32:
+        return ("lib", "x64")
+    else:
+        return ("lib64",)
+
+
+def _get_cudalib_dir_path_decision():
+    options = [
+        ("Conda environment", get_conda_ctk()),
+        ("Conda environment (NVIDIA package)", get_nvidia_cudalib_ctk()),
+        ("CUDA_HOME", get_cuda_home(_cudalib_path())),
+        ("System", get_system_ctk(_cudalib_path())),
+    ]
+    by, libdir = _find_valid_path(options)
+    return by, libdir
+
+
+def _get_static_cudalib_dir_path_decision():
+    options = [
+        ("Conda environment", get_conda_ctk()),
+        ("Conda environment (NVIDIA package)", get_nvidia_static_cudalib_ctk()),
+        ("CUDA_HOME", get_cuda_home(*_cuda_home_static_cudalib_path())),
+        ("System", get_system_ctk(_cudalib_path())),
+    ]
+    by, libdir = _find_valid_path(options)
+    return by, libdir
+
+
+def _get_cudalib_dir():
+    by, libdir = _get_cudalib_dir_path_decision()
+    return _env_path_tuple(by, libdir)
+
+
+def _get_static_cudalib_dir():
+    by, libdir = _get_static_cudalib_dir_path_decision()
+    return _env_path_tuple(by, libdir)
+
+
+def get_system_ctk(*subdirs):
+    """Return path to system-wide cudatoolkit; or, None if it doesn't exist."""
+    # Linux?
+    if sys.platform.startswith("linux"):
+        # Is cuda alias to /usr/local/cuda?
+        # We are intentionally not getting versioned cuda installation.
+        base = "/usr/local/cuda"
+        if os.path.exists(base):
+            return os.path.join(base, *subdirs)
+
+
+def get_conda_ctk():
+    """Return path to directory containing the shared libraries of cudatoolkit."""
+    is_conda_env = os.path.exists(os.path.join(sys.prefix, "conda-meta"))
+    if not is_conda_env:
+        return
+    # Assume the existence of NVVM to imply cudatoolkit installed
+    paths = find_lib("nvvm")
+    if not paths:
+        return
+    # Use the directory name of the max path
+    return os.path.dirname(max(paths))
+
+
+def get_nvidia_nvvm_ctk():
+    """Return path to directory containing the NVVM shared library."""
+    is_conda_env = os.path.exists(os.path.join(sys.prefix, "conda-meta"))
+    if not is_conda_env:
+        return
+
+    # Assume the existence of NVVM in the conda env implies that a CUDA toolkit
+    # conda package is installed.
+
+    # First, try the location used on Linux and the Windows 11.x packages
+    libdir = os.path.join(sys.prefix, "nvvm", _cudalib_path())
+    if not os.path.exists(libdir) or not os.path.isdir(libdir):
+        # If that fails, try the location used for Windows 12.x packages
+        libdir = os.path.join(sys.prefix, "Library", "nvvm", _cudalib_path())
+        if not os.path.exists(libdir) or not os.path.isdir(libdir):
+            # If that doesn't exist either, assume we don't have the NVIDIA
+            # conda package
+            return
+
+    paths = find_lib("nvvm", libdir=libdir)
+    if not paths:
+        return
+    # Use the directory name of the max path
+    return os.path.dirname(max(paths))
+
+
+def get_nvidia_libdevice_ctk():
+    """Return path to directory containing the libdevice library."""
+    nvvm_ctk = get_nvidia_nvvm_ctk()
+    if not nvvm_ctk:
+        return
+    nvvm_dir = os.path.dirname(nvvm_ctk)
+    return os.path.join(nvvm_dir, "libdevice")
+
+
+def get_nvidia_cudalib_ctk():
+    """Return path to directory containing the shared libraries of cudatoolkit."""
+    nvvm_ctk = get_nvidia_nvvm_ctk()
+    if not nvvm_ctk:
+        return
+    env_dir = os.path.dirname(os.path.dirname(nvvm_ctk))
+    subdir = "bin" if IS_WIN32 else "lib"
+    return os.path.join(env_dir, subdir)
+
+
+def get_nvidia_static_cudalib_ctk():
+    """Return path to directory containing the static libraries of cudatoolkit."""
+    nvvm_ctk = get_nvidia_nvvm_ctk()
+    if not nvvm_ctk:
+        return
+
+    if IS_WIN32 and ("Library" not in nvvm_ctk):  # noqa: SIM108
+        # Location specific to CUDA 11.x packages on Windows
+        dirs = ("Lib", "x64")
+    else:
+        # Linux, or Windows with CUDA 12.x packages
+        dirs = ("lib",)
+
+    env_dir = os.path.dirname(os.path.dirname(nvvm_ctk))
+    return os.path.join(env_dir, *dirs)
+
+
+def get_cuda_home(*subdirs):
+    """Get paths of CUDA_HOME.
+    If *subdirs* are the subdirectory name to be appended in the resulting
+    path.
+    """
+    cuda_home = os.environ.get("CUDA_HOME")
+    if cuda_home is None:
+        # Try Windows CUDA installation without Anaconda
+        cuda_home = os.environ.get("CUDA_PATH")
+    if cuda_home is not None:
+        return os.path.join(cuda_home, *subdirs)
+
+
+def _get_nvvm_path():
+    by, path = _get_nvvm_path_decision()
+    if by == "NVIDIA NVCC Wheel":
+        # The NVVM path is a directory, not a file
+        path = os.path.join(path, "libnvvm.so")
+    else:
+        candidates = find_lib("nvvm", path)
+        path = max(candidates) if candidates else None
+    return _env_path_tuple(by, path)
+
+
+def get_cuda_paths():
+    """Returns a dictionary mapping component names to a 2-tuple
+    of (source_variable, info).
+    The returned dictionary will have the following keys and infos:
+    - "nvvm": file_path
+    - "libdevice": List[Tuple[arch, file_path]]
+    - "cudalib_dir": directory_path
+    Note: The result of the function is cached.
+    """
+    # Check cache
+    if hasattr(get_cuda_paths, "_cached_result"):
+        return get_cuda_paths._cached_result
+    else:
+        # Not in cache
+        d = {
+            "nvvm": _get_nvvm_path(),
+            "libdevice": _get_libdevice_paths(),
+            "cudalib_dir": _get_cudalib_dir(),
+            "static_cudalib_dir": _get_static_cudalib_dir(),
+            "include_dir": _get_include_dir(),
+        }
+        # Cache result
+        get_cuda_paths._cached_result = d
+        return d
+
+
+def get_debian_pkg_libdevice():
+    """
+    Return the Debian NVIDIA Maintainers-packaged libdevice location, if it
+    exists.
+    """
+    pkg_libdevice_location = "/usr/lib/nvidia-cuda-toolkit/libdevice"
+    if not os.path.exists(pkg_libdevice_location):
+        return None
+    return pkg_libdevice_location
+
+
+def get_libdevice_wheel():
+    nvvm_path = _get_nvvm_wheel()
+    if nvvm_path is None:
+        return None
+    nvvm_path = Path(nvvm_path)
+    libdevice_path = nvvm_path.parent / "libdevice"
+
+    return str(libdevice_path)
+
+
+def get_current_cuda_target_name():
+    """Determine conda's CTK target folder based on system and machine arch.
+    CTK's conda package delivers headers based on its architecture type. For example,
+    `x86_64` machine places header under `$CONDA_PREFIX/targets/x86_64-linux`, and
+    `aarch64` places under `$CONDA_PREFIX/targets/sbsa-linux`. Read more about the
+    nuances at cudart's conda feedstock:
+    https://github.com/conda-forge/cuda-cudart-feedstock/blob/main/recipe/meta.yaml#L8-L11  # noqa: E501
+    """
+    system = platform.system()
+    machine = platform.machine()
+
+    if system == "Linux":
+        arch_to_targets = {"x86_64": "x86_64-linux", "aarch64": "sbsa-linux"}
+    elif system == "Windows":
+        arch_to_targets = {
+            "AMD64": "x64",
+        }
+    else:
+        arch_to_targets = {}
+
+    return arch_to_targets.get(machine, None)
+
+
+def get_conda_include_dir():
+    """
+    Return the include directory in the current conda environment, if one
+    is active and it exists.
+    """
+    is_conda_env = os.path.exists(os.path.join(sys.prefix, "conda-meta"))
+    if not is_conda_env:
+        return
+
+    if platform.system() == "Windows":
+        include_dir = os.path.join(sys.prefix, "Library", "include")
+    elif target_name := get_current_cuda_target_name():
+        include_dir = os.path.join(sys.prefix, "targets", target_name, "include")
+    else:
+        # A fallback when target cannot determined
+        # though usually it shouldn't.
+        include_dir = os.path.join(sys.prefix, "include")
+
+    if (
+        os.path.exists(include_dir)
+        and os.path.isdir(include_dir)
+        and os.path.exists(os.path.join(include_dir, "cuda_device_runtime_api.h"))
+    ):
+        return include_dir
+    return
+
+
+def _get_include_dir():
+    """Find the root include directory."""
+    options = [
+        ("Conda environment (NVIDIA package)", get_conda_include_dir()),
+        ("CUDA_INCLUDE_PATH Config Entry", config_CUDA_INCLUDE_PATH),
+        # TODO: add others
+    ]
+    by, include_dir = _find_valid_path(options)
+    return _env_path_tuple(by, include_dir)
\ No newline at end of file
diff --git a/cuda_bindings/cuda/bindings/_path_finder/find_nvidia_dynamic_library.py b/cuda_bindings/cuda/bindings/_path_finder/find_nvidia_dynamic_library.py
new file mode 100644
index 0000000000..d8413282e6
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_path_finder/find_nvidia_dynamic_library.py
@@ -0,0 +1,139 @@
+# Copyright 2024-2025 NVIDIA Corporation.  All rights reserved.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+import functools
+import glob
+import os
+
+from .cuda_paths import IS_WIN32, get_cuda_paths
+from .sys_path_find_sub_dirs import sys_path_find_sub_dirs
+
+
+def _no_such_file_in_sub_dirs(sub_dirs, file_wild, error_messages, attachments):
+    error_messages.append(f"No such file: {file_wild}")
+    for sub_dir in sys_path_find_sub_dirs(sub_dirs):
+        attachments.append(f'  listdir("{sub_dir}"):')
+        for node in sorted(os.listdir(sub_dir)):
+            attachments.append(f"    {node}")
+
+
+def _find_so_using_nvidia_lib_dirs(libname, so_basename, error_messages, attachments):
+    if libname == "nvvm":  # noqa: SIM108
+        nvidia_sub_dirs = ("nvidia", "*", "nvvm", "lib64")
+    else:
+        nvidia_sub_dirs = ("nvidia", "*", "lib")
+    file_wild = so_basename + "*"
+    for lib_dir in sys_path_find_sub_dirs(nvidia_sub_dirs):
+        # First look for an exact match
+        so_name = os.path.join(lib_dir, so_basename)
+        if os.path.isfile(so_name):
+            return so_name
+        # Look for a versioned library
+        # Using sort here mainly to make the result deterministic.
+        for node in sorted(glob.glob(os.path.join(lib_dir, file_wild))):
+            so_name = os.path.join(lib_dir, node)
+            if os.path.isfile(so_name):
+                return so_name
+    _no_such_file_in_sub_dirs(nvidia_sub_dirs, file_wild, error_messages, attachments)
+    return None
+
+
+def _find_dll_using_nvidia_bin_dirs(libname, error_messages, attachments):
+    if libname == "nvvm":  # noqa: SIM108
+        nvidia_sub_dirs = ("nvidia", "*", "nvvm", "bin")
+    else:
+        nvidia_sub_dirs = ("nvidia", "*", "bin")
+    file_wild = libname + "*.dll"
+    for bin_dir in sys_path_find_sub_dirs(nvidia_sub_dirs):
+        for node in sorted(glob.glob(os.path.join(bin_dir, file_wild))):
+            dll_name = os.path.join(bin_dir, node)
+            if os.path.isfile(dll_name):
+                return dll_name
+    _no_such_file_in_sub_dirs(nvidia_sub_dirs, file_wild, error_messages, attachments)
+    return None
+
+
+def _get_cuda_paths_info(key, error_messages):
+    env_path_tuple = get_cuda_paths()[key]
+    if not env_path_tuple:
+        error_messages.append(f'Failure obtaining get_cuda_paths()["{key}"]')
+        return None
+    if not env_path_tuple.info:
+        error_messages.append(f'Failure obtaining get_cuda_paths()["{key}"].info')
+        return None
+    return env_path_tuple.info
+
+
+def _find_so_using_cudalib_dir(so_basename, error_messages, attachments):
+    cudalib_dir = _get_cuda_paths_info("cudalib_dir", error_messages)
+    if cudalib_dir is None:
+        return None
+    primary_so_dir = cudalib_dir + "/"
+    candidate_so_dirs = [primary_so_dir]
+    libs = ["/lib/", "/lib64/"]
+    for _ in range(2):
+        alt_dir = libs[0].join(primary_so_dir.rsplit(libs[1], 1))
+        if alt_dir not in candidate_so_dirs:
+            candidate_so_dirs.append(alt_dir)
+        libs.reverse()
+    candidate_so_names = [so_dirname + so_basename for so_dirname in candidate_so_dirs]
+    error_messages = []
+    for so_name in candidate_so_names:
+        if os.path.isfile(so_name):
+            return so_name
+        error_messages.append(f"No such file: {so_name}")
+    for so_dirname in candidate_so_dirs:
+        attachments.append(f'  listdir("{so_dirname}"):')
+        if not os.path.isdir(so_dirname):
+            attachments.append("    DIRECTORY DOES NOT EXIST")
+        else:
+            for node in sorted(os.listdir(so_dirname)):
+                attachments.append(f"    {node}")
+    return None
+
+
+def _find_dll_using_cudalib_dir(libname, error_messages, attachments):
+    cudalib_dir = _get_cuda_paths_info("cudalib_dir", error_messages)
+    if cudalib_dir is None:
+        return None
+    file_wild = libname + "*.dll"
+    for node in sorted(glob.glob(os.path.join(cudalib_dir, file_wild))):
+        dll_name = os.path.join(cudalib_dir, node)
+        if os.path.isfile(dll_name):
+            return dll_name
+    error_messages.append(f"No such file: {file_wild}")
+    attachments.append(f'  listdir("{cudalib_dir}"):')
+    for node in sorted(os.listdir(cudalib_dir)):
+        attachments.append(f"    {node}")
+    return None
+
+
+@functools.cache
+def find_nvidia_dynamic_library(name: str) -> str:
+    error_messages = []
+    attachments = []
+
+    if IS_WIN32:
+        dll_name = _find_dll_using_nvidia_bin_dirs(name, error_messages, attachments)
+        if dll_name is None:
+            if name == "nvvm":
+                dll_name = _get_cuda_paths_info("nvvm", error_messages)
+            else:
+                dll_name = _find_dll_using_cudalib_dir(name, error_messages, attachments)
+        if dll_name is None:
+            attachments = "\n".join(attachments)
+            raise RuntimeError(f"Failure finding {name}*.dll: {', '.join(error_messages)}\n{attachments}")
+        return dll_name
+
+    so_basename = f"lib{name}.so"
+    so_name = _find_so_using_nvidia_lib_dirs(name, so_basename, error_messages, attachments)
+    if so_name is None:
+        if name == "nvvm":
+            so_name = _get_cuda_paths_info("nvvm", error_messages)
+        else:
+            so_name = _find_so_using_cudalib_dir(so_basename, error_messages, attachments)
+    if so_name is None:
+        attachments = "\n".join(attachments)
+        raise RuntimeError(f"Failure finding {so_basename}: {', '.join(error_messages)}\n{attachments}")
+    return so_name
\ No newline at end of file
diff --git a/cuda_bindings/cuda/bindings/_path_finder/findlib.py b/cuda_bindings/cuda/bindings/_path_finder/findlib.py
new file mode 100644
index 0000000000..c64c3c9577
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_path_finder/findlib.py
@@ -0,0 +1,69 @@
+# Forked from:
+# https://github.com/numba/numba/blob/f0d24824fcd6a454827e3c108882395d00befc04/numba/misc/findlib.py
+
+import os
+import re
+import sys
+
+
+def get_lib_dirs():
+    """
+    Anaconda specific
+    """
+    if sys.platform == "win32":
+        # on windows, historically `DLLs` has been used for CUDA libraries,
+        # since approximately CUDA 9.2, `Library\bin` has been used.
+        dirnames = ["DLLs", os.path.join("Library", "bin")]
+    else:
+        dirnames = [
+            "lib",
+        ]
+    libdirs = [os.path.join(sys.prefix, x) for x in dirnames]
+    return libdirs
+
+
+DLLNAMEMAP = {
+    "linux": r"lib%(name)s\.so\.%(ver)s$",
+    "linux2": r"lib%(name)s\.so\.%(ver)s$",
+    "linux-static": r"lib%(name)s\.a$",
+    "darwin": r"lib%(name)s\.%(ver)s\.dylib$",
+    "win32": r"%(name)s%(ver)s\.dll$",
+    "win32-static": r"%(name)s\.lib$",
+    "bsd": r"lib%(name)s\.so\.%(ver)s$",
+}
+
+RE_VER = r"[0-9]*([_\.][0-9]+)*"
+
+
+def find_lib(libname, libdir=None, platform=None, static=False):
+    platform = platform or sys.platform
+    platform = "bsd" if "bsd" in platform else platform
+    if static:
+        platform = f"{platform}-static"
+    if platform not in DLLNAMEMAP:
+        # Return empty list if platform name is undefined.
+        # Not all platforms define their static library paths.
+        return []
+    pat = DLLNAMEMAP[platform] % {"name": libname, "ver": RE_VER}
+    regex = re.compile(pat)
+    return find_file(regex, libdir)
+
+
+def find_file(pat, libdir=None):
+    if libdir is None:
+        libdirs = get_lib_dirs()
+    elif isinstance(libdir, str):
+        libdirs = [
+            libdir,
+        ]
+    else:
+        libdirs = list(libdir)
+    files = []
+    for ldir in libdirs:
+        try:
+            entries = os.listdir(ldir)
+        except FileNotFoundError:
+            continue
+        candidates = [os.path.join(ldir, ent) for ent in entries if pat.match(ent)]
+        files.extend([c for c in candidates if os.path.isfile(c)])
+    return files
\ No newline at end of file
diff --git a/cuda_bindings/cuda/bindings/_path_finder/load_nvidia_dynamic_library.py b/cuda_bindings/cuda/bindings/_path_finder/load_nvidia_dynamic_library.py
new file mode 100644
index 0000000000..69aadabcbf
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_path_finder/load_nvidia_dynamic_library.py
@@ -0,0 +1,92 @@
+import functools
+import sys
+
+if sys.platform == "win32":
+    import ctypes.wintypes
+
+    import pywintypes
+    import win32api
+
+    # Mirrors WinBase.h (unfortunately not defined already elsewhere)
+    _WINBASE_LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800
+
+else:
+    import ctypes
+    import os
+
+    _LINUX_CDLL_MODE = os.RTLD_NOW | os.RTLD_GLOBAL
+
+from .find_nvidia_dynamic_library import find_nvidia_dynamic_library
+
+
+@functools.cache
+def _windows_cuDriverGetVersion() -> int:
+    handle = win32api.LoadLibrary("nvcuda.dll")
+
+    kernel32 = ctypes.WinDLL("kernel32", use_last_error=True)
+    GetProcAddress = kernel32.GetProcAddress
+    GetProcAddress.argtypes = [ctypes.wintypes.HMODULE, ctypes.wintypes.LPCSTR]
+    GetProcAddress.restype = ctypes.c_void_p
+    cuDriverGetVersion = GetProcAddress(handle, b"cuDriverGetVersion")
+    assert cuDriverGetVersion
+
+    FUNC_TYPE = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.POINTER(ctypes.c_int))
+    cuDriverGetVersion_fn = FUNC_TYPE(cuDriverGetVersion)
+    driver_ver = ctypes.c_int()
+    err = cuDriverGetVersion_fn(ctypes.byref(driver_ver))
+    assert err == 0
+    return driver_ver.value
+
+
+@functools.cache
+def _windows_load_with_dll_basename(name: str) -> int:
+    driver_ver = _windows_cuDriverGetVersion()
+    del driver_ver  # Keeping this here because it will probably be needed in the future.
+
+    if name == "nvJitLink":
+        dll_name = "nvJitLink_120_0.dll"
+    elif name == "nvrtc":
+        dll_name = "nvrtc64_120_0.dll"
+    elif name == "nvvm":
+        dll_name = "nvvm64_40_0.dll"
+
+    try:
+        return win32api.LoadLibrary(dll_name)
+    except pywintypes.error:
+        pass
+
+    return None
+
+
+@functools.cache
+def load_nvidia_dynamic_library(name: str) -> int:
+    # First try using the platform-specific dynamic loader search mechanisms
+    if sys.platform == "win32":
+        handle = _windows_load_with_dll_basename(name)
+        if handle:
+            return handle
+    else:
+        dl_path = f"lib{name}.so"  # Version intentionally no specified.
+        try:
+            handle = ctypes.CDLL(dl_path, _LINUX_CDLL_MODE)
+        except OSError:
+            pass
+        else:
+            # Use `cdef void* ptr = <void*><uintptr_t>` in cython to convert back to void*
+            return handle._handle  # C unsigned int
+
+    dl_path = find_nvidia_dynamic_library(name)
+    if sys.platform == "win32":
+        try:
+            handle = win32api.LoadLibrary(dl_path)
+        except pywintypes.error as e:
+            raise RuntimeError(f"Failed to load DLL at {dl_path}: {e}") from e
+        # Use `cdef void* ptr = <void*><intptr_t>` in cython to convert back to void*
+        return handle  # C signed int, matches win32api.GetProcAddress
+    else:
+        try:
+            handle = ctypes.CDLL(dl_path, _LINUX_CDLL_MODE)
+        except OSError as e:
+            raise RuntimeError(f"Failed to dlopen {dl_path}: {e}") from e
+        # Use `cdef void* ptr = <void*><uintptr_t>` in cython to convert back to void*
+        return handle._handle  # C unsigned int
\ No newline at end of file
diff --git a/cuda_bindings/cuda/bindings/_path_finder/sys_path_find_sub_dirs.py b/cuda_bindings/cuda/bindings/_path_finder/sys_path_find_sub_dirs.py
new file mode 100644
index 0000000000..324cdeec30
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_path_finder/sys_path_find_sub_dirs.py
@@ -0,0 +1,40 @@
+# Copyright 2024-2025 NVIDIA Corporation.  All rights reserved.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+import functools
+import os
+import sys
+
+
+@functools.cache
+def _impl(sys_path, sub_dirs):
+    results = []
+    for base in sys_path:
+        stack = [(base, 0)]  # (current_path, index into sub_dirs)
+        while stack:
+            current_path, idx = stack.pop()
+            if idx == len(sub_dirs):
+                if os.path.isdir(current_path):
+                    results.append(current_path)
+                continue
+
+            sub = sub_dirs[idx]
+            if sub == "*":
+                try:
+                    entries = sorted(os.listdir(current_path))
+                except OSError:
+                    continue
+                for entry in entries:
+                    entry_path = os.path.join(current_path, entry)
+                    if os.path.isdir(entry_path):
+                        stack.append((entry_path, idx + 1))
+            else:
+                next_path = os.path.join(current_path, sub)
+                if os.path.isdir(next_path):
+                    stack.append((next_path, idx + 1))
+    return results
+
+
+def sys_path_find_sub_dirs(sub_dirs):
+    return _impl(tuple(sys.path), tuple(sub_dirs))
\ No newline at end of file
diff --git a/cuda_bindings/cuda/bindings/path_finder.py b/cuda_bindings/cuda/bindings/path_finder.py
new file mode 100644
index 0000000000..12c02b930b
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/path_finder.py
@@ -0,0 +1,37 @@
+# Copyright 2024-2025 NVIDIA Corporation.  All rights reserved.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+from cuda.bindings._path_finder.cuda_paths import (
+    get_conda_ctk,
+    get_conda_include_dir,
+    get_cuda_home,
+    get_cuda_paths,
+    get_current_cuda_target_name,
+    get_debian_pkg_libdevice,
+    get_libdevice_wheel,
+    get_nvidia_cudalib_ctk,
+    get_nvidia_libdevice_ctk,
+    get_nvidia_nvvm_ctk,
+    get_nvidia_static_cudalib_ctk,
+    get_system_ctk,
+)
+from cuda.bindings._path_finder.find_nvidia_dynamic_library import find_nvidia_dynamic_library
+from cuda.bindings._path_finder.load_nvidia_dynamic_library import load_nvidia_dynamic_library
+
+__all__ = [
+    "find_nvidia_dynamic_library",
+    "load_nvidia_dynamic_library",
+    "get_conda_ctk",
+    "get_conda_include_dir",
+    "get_cuda_home",
+    "get_cuda_paths",
+    "get_current_cuda_target_name",
+    "get_debian_pkg_libdevice",
+    "get_libdevice_wheel",
+    "get_nvidia_cudalib_ctk",
+    "get_nvidia_libdevice_ctk",
+    "get_nvidia_nvvm_ctk",
+    "get_nvidia_static_cudalib_ctk",
+    "get_system_ctk",
+]
\ No newline at end of file

From 5dbfb2d0061d6ca9a656e151d922f4fd9ec255b5 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Wed, 17 Dec 2025 13:54:03 +0000
Subject: [PATCH 18/79] add builder files

---
 cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx | 4 ++--
 cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx      | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
index d095b3f524..785f587951 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
@@ -9,7 +9,7 @@ from libc.stdint cimport intptr_t, uintptr_t
 import threading
 from .utils import FunctionNotFoundError, NotSupportedError
 
-from cuda.pathfinder import load_nvidia_dynamic_lib
+from cuda.bindings import path_finder
 
 
 
@@ -77,7 +77,7 @@ cdef void* __nvJitLinkVersion = NULL
 
 
 cdef void* load_library() except* with gil:
-    cdef uintptr_t handle = load_nvidia_dynamic_lib("nvJitLink")._handle_uint
+    cdef uintptr_t handle = path_finder.load_nvidia_dynamic_library("nvJitLink")
     return <void*>handle
 
 
diff --git a/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx
index 408a2cb592..15845e3228 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx
@@ -10,6 +10,7 @@ import threading
 from .utils import FunctionNotFoundError, NotSupportedError
 
 from cuda.pathfinder import load_nvidia_dynamic_lib
+from cuda.bindings import path_finder
 
 
 ###############################################################################
@@ -75,7 +76,7 @@ cdef void* __nvvmGetProgramLog = NULL
 
 
 cdef void* load_library() except* with gil:
-    cdef uintptr_t handle = load_nvidia_dynamic_lib("nvvm")._handle_uint
+    cdef uintptr_t handle = path_finder.load_nvidia_dynamic_library("nvvm")
     return <void*>handle
 
 

From 0a9eea9967e017bb20eea4801fdbba16e078ce50 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Wed, 17 Dec 2025 14:06:11 +0000
Subject: [PATCH 19/79] use python lists/tuples

---
 cuda_core/cuda/core/experimental/_program.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index 4da8191f26..1422ceda5f 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -7,7 +7,7 @@
 import weakref
 from contextlib import contextmanager
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, List, Tuple, Union
+from typing import TYPE_CHECKING, Union
 from warnings import warn
 
 if TYPE_CHECKING:

From 79138c0950aaee94426e5858df48010bb6db4530 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Thu, 18 Dec 2025 20:01:25 +0000
Subject: [PATCH 20/79] libdevice integration

---
 cuda_core/cuda/core/experimental/_program.py | 58 +++++++++++++++++++-
 1 file changed, 57 insertions(+), 1 deletion(-)

diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index 1422ceda5f..a5ad2a3be2 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -102,6 +102,47 @@ def _get_nvvm_module():
         _nvvm_module = None
         raise e
 
+def _find_libdevice_path():
+    """
+    Find libdevice.10.bc using cuda.bindings.path_finder.
+    
+    Returns:
+        str: Path to libdevice.10.bc, or None if not found
+    """
+    try:
+        from cuda.bindings.path_finder import (
+            get_nvidia_libdevice_ctk,
+            get_libdevice_wheel,
+            get_debian_pkg_libdevice,
+        )
+        
+        for getter in [get_nvidia_libdevice_ctk, get_libdevice_wheel, get_debian_pkg_libdevice]:
+            try:
+                result = getter()
+                if result is not None and result.info is not None:
+                    return result.info
+            except Exception:
+                continue
+        
+        return None
+    except ImportError:
+        import os
+        
+        # CUDA_HOME
+        cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH")
+        if cuda_home:
+            libdevice_path = os.path.join(cuda_home, "nvvm", "libdevice", "libdevice.10.bc")
+            if os.path.isfile(libdevice_path):
+                return libdevice_path
+        
+        # Linux paths
+        for base in ["/usr/local/cuda", "/opt/cuda"]:
+            libdevice_path = os.path.join(base, "nvvm", "libdevice", "libdevice.10.bc")
+            if os.path.isfile(libdevice_path):
+                return libdevice_path
+        
+        return None
+
 
 def _process_define_macro_inner(formatted_options, macro):
     if isinstance(macro, str):
@@ -352,6 +393,7 @@ class ProgramOptions:
     pch_messages: bool | None = None
     instantiate_templates_in_pch: bool | None = None
     numba_debug: bool | None = None  # Custom option for Numba debugging
+    use_libdevice: bool | None = None # Use libdevice
 
     def __post_init__(self):
         self._name = self.name.encode()
@@ -748,7 +790,8 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
 
                     nvvm.add_module_to_program(self._mnff.handle, module_source, len(module_source), module_name)
                     self._module_count += 1
-
+                    
+            self._use_libdevice = options.use_libdevice
             self._backend = "NVVM"
             self._linker = None
 
@@ -866,6 +909,19 @@ def compile(self, target_type, name_expressions=(), logs=None):
             nvvm = _get_nvvm_module()
             with _nvvm_exception_manager(self):
                 nvvm.verify_program(self._mnff.handle, len(nvvm_options), nvvm_options)
+                # Invoke libdevice
+                if getattr(self, '_use_libdevice', False):
+                    libdevice_path = _find_libdevice_path()
+                    if libdevice_path is None:
+                        raise RuntimeError(
+                            "use_libdevice=True but could not find libdevice.10.bc. "
+                            "Ensure CUDA toolkit is installed."
+                        )
+                    with open(libdevice_path, "rb") as f:
+                        libdevice_bc = f.read()
+                    # Use lazy_add_module for libdevice bitcode only following numba-cuda
+                    nvvm.lazy_add_module_to_program(self._mnff.handle, libdevice_bc, len(libdevice_bc), None)
+                
                 nvvm.compile_program(self._mnff.handle, len(nvvm_options), nvvm_options)
 
             size = nvvm.get_compiled_result_size(self._mnff.handle)

From 25d336c1d7644b801d3730fce702a231175bde47 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Fri, 19 Dec 2025 18:43:25 +0000
Subject: [PATCH 21/79] refresh

---
 cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx | 6 ++----
 cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx      | 4 +---
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
index 785f587951..8025189b34 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
@@ -9,9 +9,7 @@ from libc.stdint cimport intptr_t, uintptr_t
 import threading
 from .utils import FunctionNotFoundError, NotSupportedError
 
-from cuda.bindings import path_finder
-
-
+from cuda.pathfinder import load_nvidia_dynamic_lib
 
 ###############################################################################
 # Extern
@@ -77,7 +75,7 @@ cdef void* __nvJitLinkVersion = NULL
 
 
 cdef void* load_library() except* with gil:
-    cdef uintptr_t handle = path_finder.load_nvidia_dynamic_library("nvJitLink")
+    cdef uintptr_t handle = load_nvidia_dynamic_lib("nvJitLink")._handle_uint
     return <void*>handle
 
 
diff --git a/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx
index 15845e3228..5411e157c7 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx
@@ -10,8 +10,6 @@ import threading
 from .utils import FunctionNotFoundError, NotSupportedError
 
 from cuda.pathfinder import load_nvidia_dynamic_lib
-from cuda.bindings import path_finder
-
 
 ###############################################################################
 # Extern
@@ -76,7 +74,7 @@ cdef void* __nvvmGetProgramLog = NULL
 
 
 cdef void* load_library() except* with gil:
-    cdef uintptr_t handle = path_finder.load_nvidia_dynamic_library("nvvm")
+    cdef uintptr_t handle = load_nvidia_dynamic_lib("nvvm")._handle_uint
     return <void*>handle
 
 

From 32c1913af18f49e848bdeaf7e0abb20813005892 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Fri, 19 Dec 2025 18:44:08 +0000
Subject: [PATCH 22/79] refresh

---
 cuda_bindings/cuda/bindings/path_finder.py | 37 ----------------------
 1 file changed, 37 deletions(-)
 delete mode 100644 cuda_bindings/cuda/bindings/path_finder.py

diff --git a/cuda_bindings/cuda/bindings/path_finder.py b/cuda_bindings/cuda/bindings/path_finder.py
deleted file mode 100644
index 12c02b930b..0000000000
--- a/cuda_bindings/cuda/bindings/path_finder.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# Copyright 2024-2025 NVIDIA Corporation.  All rights reserved.
-#
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
-
-from cuda.bindings._path_finder.cuda_paths import (
-    get_conda_ctk,
-    get_conda_include_dir,
-    get_cuda_home,
-    get_cuda_paths,
-    get_current_cuda_target_name,
-    get_debian_pkg_libdevice,
-    get_libdevice_wheel,
-    get_nvidia_cudalib_ctk,
-    get_nvidia_libdevice_ctk,
-    get_nvidia_nvvm_ctk,
-    get_nvidia_static_cudalib_ctk,
-    get_system_ctk,
-)
-from cuda.bindings._path_finder.find_nvidia_dynamic_library import find_nvidia_dynamic_library
-from cuda.bindings._path_finder.load_nvidia_dynamic_library import load_nvidia_dynamic_library
-
-__all__ = [
-    "find_nvidia_dynamic_library",
-    "load_nvidia_dynamic_library",
-    "get_conda_ctk",
-    "get_conda_include_dir",
-    "get_cuda_home",
-    "get_cuda_paths",
-    "get_current_cuda_target_name",
-    "get_debian_pkg_libdevice",
-    "get_libdevice_wheel",
-    "get_nvidia_cudalib_ctk",
-    "get_nvidia_libdevice_ctk",
-    "get_nvidia_nvvm_ctk",
-    "get_nvidia_static_cudalib_ctk",
-    "get_system_ctk",
-]
\ No newline at end of file

From 01f03e552806f65ae27defb8e5ce32a1aa79e7cc Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Fri, 19 Dec 2025 18:45:37 +0000
Subject: [PATCH 23/79] refresh

---
 .../cuda/bindings/_path_finder/cuda_paths.py  | 400 ------------------
 .../find_nvidia_dynamic_library.py            | 139 ------
 .../cuda/bindings/_path_finder/findlib.py     |  69 ---
 .../load_nvidia_dynamic_library.py            |  92 ----
 .../_path_finder/sys_path_find_sub_dirs.py    |  40 --
 5 files changed, 740 deletions(-)
 delete mode 100644 cuda_bindings/cuda/bindings/_path_finder/cuda_paths.py
 delete mode 100644 cuda_bindings/cuda/bindings/_path_finder/find_nvidia_dynamic_library.py
 delete mode 100644 cuda_bindings/cuda/bindings/_path_finder/findlib.py
 delete mode 100644 cuda_bindings/cuda/bindings/_path_finder/load_nvidia_dynamic_library.py
 delete mode 100644 cuda_bindings/cuda/bindings/_path_finder/sys_path_find_sub_dirs.py

diff --git a/cuda_bindings/cuda/bindings/_path_finder/cuda_paths.py b/cuda_bindings/cuda/bindings/_path_finder/cuda_paths.py
deleted file mode 100644
index c827699b3a..0000000000
--- a/cuda_bindings/cuda/bindings/_path_finder/cuda_paths.py
+++ /dev/null
@@ -1,400 +0,0 @@
-import os
-import platform
-import re
-import site
-import sys
-import traceback
-import warnings
-from collections import namedtuple
-from pathlib import Path
-
-from .findlib import find_file, find_lib
-
-IS_WIN32 = sys.platform.startswith("win32")
-
-_env_path_tuple = namedtuple("_env_path_tuple", ["by", "info"])
-
-
-def _get_numba_CUDA_INCLUDE_PATH():
-    # From numba/numba/core/config.py
-
-    def _readenv(name, ctor, default):
-        value = os.environ.get(name)
-        if value is None:
-            return default() if callable(default) else default
-        try:
-            return ctor(value)
-        except Exception:
-            warnings.warn(  # noqa: B028
-                f"Environment variable '{name}' is defined but "
-                f"its associated value '{value}' could not be "
-                "parsed.\nThe parse failed with exception:\n"
-                f"{traceback.format_exc()}",
-                RuntimeWarning,
-            )
-            return default
-
-    if IS_WIN32:
-        cuda_path = os.environ.get("CUDA_PATH")
-        if cuda_path:  # noqa: SIM108
-            default_cuda_include_path = os.path.join(cuda_path, "include")
-        else:
-            default_cuda_include_path = "cuda_include_not_found"
-    else:
-        default_cuda_include_path = os.path.join(os.sep, "usr", "local", "cuda", "include")
-    CUDA_INCLUDE_PATH = _readenv("NUMBA_CUDA_INCLUDE_PATH", str, default_cuda_include_path)
-    return CUDA_INCLUDE_PATH
-
-
-config_CUDA_INCLUDE_PATH = _get_numba_CUDA_INCLUDE_PATH()
-
-
-def _find_valid_path(options):
-    """Find valid path from *options*, which is a list of 2-tuple of
-    (name, path).  Return first pair where *path* is not None.
-    If no valid path is found, return ('<unknown>', None)
-    """
-    for by, data in options:
-        if data is not None:
-            return by, data
-    else:
-        return "<unknown>", None
-
-
-def _get_libdevice_path_decision():
-    options = [
-        ("Conda environment", get_conda_ctk()),
-        ("Conda environment (NVIDIA package)", get_nvidia_libdevice_ctk()),
-        ("CUDA_HOME", get_cuda_home("nvvm", "libdevice")),
-        ("Debian package", get_debian_pkg_libdevice()),
-        ("NVIDIA NVCC Wheel", get_libdevice_wheel()),
-    ]
-    libdevice_ctk_dir = get_system_ctk("nvvm", "libdevice")
-    if libdevice_ctk_dir and os.path.exists(libdevice_ctk_dir):
-        options.append(("System", libdevice_ctk_dir))
-
-    by, libdir = _find_valid_path(options)
-    return by, libdir
-
-
-def _nvvm_lib_dir():
-    if IS_WIN32:
-        return "nvvm", "bin"
-    else:
-        return "nvvm", "lib64"
-
-
-def _get_nvvm_path_decision():
-    options = [
-        ("Conda environment", get_conda_ctk()),
-        ("Conda environment (NVIDIA package)", get_nvidia_nvvm_ctk()),
-        ("CUDA_HOME", get_cuda_home(*_nvvm_lib_dir())),
-        ("NVIDIA NVCC Wheel", _get_nvvm_wheel()),
-    ]
-    # need to ensure nvvm dir actually exists
-    nvvm_ctk_dir = get_system_ctk(*_nvvm_lib_dir())
-    if nvvm_ctk_dir and os.path.exists(nvvm_ctk_dir):
-        options.append(("System", nvvm_ctk_dir))
-
-    by, path = _find_valid_path(options)
-    return by, path
-
-
-def _get_nvvm_wheel():
-    site_paths = [site.getusersitepackages()] + site.getsitepackages() + ["conda", None]
-    for sp in site_paths:
-        # The SONAME is taken based on public CTK 12.x releases
-        if sys.platform.startswith("linux"):
-            dso_dir = "lib64"
-            # Hack: libnvvm from Linux wheel
-            # does not have any soname (CUDAINST-3183)
-            dso_path = "libnvvm.so"
-        elif sys.platform.startswith("win32"):
-            dso_dir = "bin"
-            dso_path = "nvvm64_40_0.dll"
-        else:
-            raise AssertionError()
-
-        if sp is not None:
-            dso_dir = os.path.join(sp, "nvidia", "cuda_nvcc", "nvvm", dso_dir)
-            dso_path = os.path.join(dso_dir, dso_path)
-            if os.path.exists(dso_path):
-                return str(Path(dso_path).parent)
-
-
-def _get_libdevice_paths():
-    by, libdir = _get_libdevice_path_decision()
-    if by == "NVIDIA NVCC Wheel":
-        # The NVVM path is a directory, not a file
-        out = os.path.join(libdir, "libdevice.10.bc")
-    else:
-        # Search for pattern
-        pat = r"libdevice(\.\d+)*\.bc$"
-        candidates = find_file(re.compile(pat), libdir)
-        # Keep only the max (most recent version) of the bitcode files.
-        out = max(candidates, default=None)
-    return _env_path_tuple(by, out)
-
-
-def _cudalib_path():
-    if IS_WIN32:
-        return "bin"
-    else:
-        return "lib64"
-
-
-def _cuda_home_static_cudalib_path():
-    if IS_WIN32:
-        return ("lib", "x64")
-    else:
-        return ("lib64",)
-
-
-def _get_cudalib_dir_path_decision():
-    options = [
-        ("Conda environment", get_conda_ctk()),
-        ("Conda environment (NVIDIA package)", get_nvidia_cudalib_ctk()),
-        ("CUDA_HOME", get_cuda_home(_cudalib_path())),
-        ("System", get_system_ctk(_cudalib_path())),
-    ]
-    by, libdir = _find_valid_path(options)
-    return by, libdir
-
-
-def _get_static_cudalib_dir_path_decision():
-    options = [
-        ("Conda environment", get_conda_ctk()),
-        ("Conda environment (NVIDIA package)", get_nvidia_static_cudalib_ctk()),
-        ("CUDA_HOME", get_cuda_home(*_cuda_home_static_cudalib_path())),
-        ("System", get_system_ctk(_cudalib_path())),
-    ]
-    by, libdir = _find_valid_path(options)
-    return by, libdir
-
-
-def _get_cudalib_dir():
-    by, libdir = _get_cudalib_dir_path_decision()
-    return _env_path_tuple(by, libdir)
-
-
-def _get_static_cudalib_dir():
-    by, libdir = _get_static_cudalib_dir_path_decision()
-    return _env_path_tuple(by, libdir)
-
-
-def get_system_ctk(*subdirs):
-    """Return path to system-wide cudatoolkit; or, None if it doesn't exist."""
-    # Linux?
-    if sys.platform.startswith("linux"):
-        # Is cuda alias to /usr/local/cuda?
-        # We are intentionally not getting versioned cuda installation.
-        base = "/usr/local/cuda"
-        if os.path.exists(base):
-            return os.path.join(base, *subdirs)
-
-
-def get_conda_ctk():
-    """Return path to directory containing the shared libraries of cudatoolkit."""
-    is_conda_env = os.path.exists(os.path.join(sys.prefix, "conda-meta"))
-    if not is_conda_env:
-        return
-    # Assume the existence of NVVM to imply cudatoolkit installed
-    paths = find_lib("nvvm")
-    if not paths:
-        return
-    # Use the directory name of the max path
-    return os.path.dirname(max(paths))
-
-
-def get_nvidia_nvvm_ctk():
-    """Return path to directory containing the NVVM shared library."""
-    is_conda_env = os.path.exists(os.path.join(sys.prefix, "conda-meta"))
-    if not is_conda_env:
-        return
-
-    # Assume the existence of NVVM in the conda env implies that a CUDA toolkit
-    # conda package is installed.
-
-    # First, try the location used on Linux and the Windows 11.x packages
-    libdir = os.path.join(sys.prefix, "nvvm", _cudalib_path())
-    if not os.path.exists(libdir) or not os.path.isdir(libdir):
-        # If that fails, try the location used for Windows 12.x packages
-        libdir = os.path.join(sys.prefix, "Library", "nvvm", _cudalib_path())
-        if not os.path.exists(libdir) or not os.path.isdir(libdir):
-            # If that doesn't exist either, assume we don't have the NVIDIA
-            # conda package
-            return
-
-    paths = find_lib("nvvm", libdir=libdir)
-    if not paths:
-        return
-    # Use the directory name of the max path
-    return os.path.dirname(max(paths))
-
-
-def get_nvidia_libdevice_ctk():
-    """Return path to directory containing the libdevice library."""
-    nvvm_ctk = get_nvidia_nvvm_ctk()
-    if not nvvm_ctk:
-        return
-    nvvm_dir = os.path.dirname(nvvm_ctk)
-    return os.path.join(nvvm_dir, "libdevice")
-
-
-def get_nvidia_cudalib_ctk():
-    """Return path to directory containing the shared libraries of cudatoolkit."""
-    nvvm_ctk = get_nvidia_nvvm_ctk()
-    if not nvvm_ctk:
-        return
-    env_dir = os.path.dirname(os.path.dirname(nvvm_ctk))
-    subdir = "bin" if IS_WIN32 else "lib"
-    return os.path.join(env_dir, subdir)
-
-
-def get_nvidia_static_cudalib_ctk():
-    """Return path to directory containing the static libraries of cudatoolkit."""
-    nvvm_ctk = get_nvidia_nvvm_ctk()
-    if not nvvm_ctk:
-        return
-
-    if IS_WIN32 and ("Library" not in nvvm_ctk):  # noqa: SIM108
-        # Location specific to CUDA 11.x packages on Windows
-        dirs = ("Lib", "x64")
-    else:
-        # Linux, or Windows with CUDA 12.x packages
-        dirs = ("lib",)
-
-    env_dir = os.path.dirname(os.path.dirname(nvvm_ctk))
-    return os.path.join(env_dir, *dirs)
-
-
-def get_cuda_home(*subdirs):
-    """Get paths of CUDA_HOME.
-    If *subdirs* are the subdirectory name to be appended in the resulting
-    path.
-    """
-    cuda_home = os.environ.get("CUDA_HOME")
-    if cuda_home is None:
-        # Try Windows CUDA installation without Anaconda
-        cuda_home = os.environ.get("CUDA_PATH")
-    if cuda_home is not None:
-        return os.path.join(cuda_home, *subdirs)
-
-
-def _get_nvvm_path():
-    by, path = _get_nvvm_path_decision()
-    if by == "NVIDIA NVCC Wheel":
-        # The NVVM path is a directory, not a file
-        path = os.path.join(path, "libnvvm.so")
-    else:
-        candidates = find_lib("nvvm", path)
-        path = max(candidates) if candidates else None
-    return _env_path_tuple(by, path)
-
-
-def get_cuda_paths():
-    """Returns a dictionary mapping component names to a 2-tuple
-    of (source_variable, info).
-    The returned dictionary will have the following keys and infos:
-    - "nvvm": file_path
-    - "libdevice": List[Tuple[arch, file_path]]
-    - "cudalib_dir": directory_path
-    Note: The result of the function is cached.
-    """
-    # Check cache
-    if hasattr(get_cuda_paths, "_cached_result"):
-        return get_cuda_paths._cached_result
-    else:
-        # Not in cache
-        d = {
-            "nvvm": _get_nvvm_path(),
-            "libdevice": _get_libdevice_paths(),
-            "cudalib_dir": _get_cudalib_dir(),
-            "static_cudalib_dir": _get_static_cudalib_dir(),
-            "include_dir": _get_include_dir(),
-        }
-        # Cache result
-        get_cuda_paths._cached_result = d
-        return d
-
-
-def get_debian_pkg_libdevice():
-    """
-    Return the Debian NVIDIA Maintainers-packaged libdevice location, if it
-    exists.
-    """
-    pkg_libdevice_location = "/usr/lib/nvidia-cuda-toolkit/libdevice"
-    if not os.path.exists(pkg_libdevice_location):
-        return None
-    return pkg_libdevice_location
-
-
-def get_libdevice_wheel():
-    nvvm_path = _get_nvvm_wheel()
-    if nvvm_path is None:
-        return None
-    nvvm_path = Path(nvvm_path)
-    libdevice_path = nvvm_path.parent / "libdevice"
-
-    return str(libdevice_path)
-
-
-def get_current_cuda_target_name():
-    """Determine conda's CTK target folder based on system and machine arch.
-    CTK's conda package delivers headers based on its architecture type. For example,
-    `x86_64` machine places header under `$CONDA_PREFIX/targets/x86_64-linux`, and
-    `aarch64` places under `$CONDA_PREFIX/targets/sbsa-linux`. Read more about the
-    nuances at cudart's conda feedstock:
-    https://github.com/conda-forge/cuda-cudart-feedstock/blob/main/recipe/meta.yaml#L8-L11  # noqa: E501
-    """
-    system = platform.system()
-    machine = platform.machine()
-
-    if system == "Linux":
-        arch_to_targets = {"x86_64": "x86_64-linux", "aarch64": "sbsa-linux"}
-    elif system == "Windows":
-        arch_to_targets = {
-            "AMD64": "x64",
-        }
-    else:
-        arch_to_targets = {}
-
-    return arch_to_targets.get(machine, None)
-
-
-def get_conda_include_dir():
-    """
-    Return the include directory in the current conda environment, if one
-    is active and it exists.
-    """
-    is_conda_env = os.path.exists(os.path.join(sys.prefix, "conda-meta"))
-    if not is_conda_env:
-        return
-
-    if platform.system() == "Windows":
-        include_dir = os.path.join(sys.prefix, "Library", "include")
-    elif target_name := get_current_cuda_target_name():
-        include_dir = os.path.join(sys.prefix, "targets", target_name, "include")
-    else:
-        # A fallback when target cannot determined
-        # though usually it shouldn't.
-        include_dir = os.path.join(sys.prefix, "include")
-
-    if (
-        os.path.exists(include_dir)
-        and os.path.isdir(include_dir)
-        and os.path.exists(os.path.join(include_dir, "cuda_device_runtime_api.h"))
-    ):
-        return include_dir
-    return
-
-
-def _get_include_dir():
-    """Find the root include directory."""
-    options = [
-        ("Conda environment (NVIDIA package)", get_conda_include_dir()),
-        ("CUDA_INCLUDE_PATH Config Entry", config_CUDA_INCLUDE_PATH),
-        # TODO: add others
-    ]
-    by, include_dir = _find_valid_path(options)
-    return _env_path_tuple(by, include_dir)
\ No newline at end of file
diff --git a/cuda_bindings/cuda/bindings/_path_finder/find_nvidia_dynamic_library.py b/cuda_bindings/cuda/bindings/_path_finder/find_nvidia_dynamic_library.py
deleted file mode 100644
index d8413282e6..0000000000
--- a/cuda_bindings/cuda/bindings/_path_finder/find_nvidia_dynamic_library.py
+++ /dev/null
@@ -1,139 +0,0 @@
-# Copyright 2024-2025 NVIDIA Corporation.  All rights reserved.
-#
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
-
-import functools
-import glob
-import os
-
-from .cuda_paths import IS_WIN32, get_cuda_paths
-from .sys_path_find_sub_dirs import sys_path_find_sub_dirs
-
-
-def _no_such_file_in_sub_dirs(sub_dirs, file_wild, error_messages, attachments):
-    error_messages.append(f"No such file: {file_wild}")
-    for sub_dir in sys_path_find_sub_dirs(sub_dirs):
-        attachments.append(f'  listdir("{sub_dir}"):')
-        for node in sorted(os.listdir(sub_dir)):
-            attachments.append(f"    {node}")
-
-
-def _find_so_using_nvidia_lib_dirs(libname, so_basename, error_messages, attachments):
-    if libname == "nvvm":  # noqa: SIM108
-        nvidia_sub_dirs = ("nvidia", "*", "nvvm", "lib64")
-    else:
-        nvidia_sub_dirs = ("nvidia", "*", "lib")
-    file_wild = so_basename + "*"
-    for lib_dir in sys_path_find_sub_dirs(nvidia_sub_dirs):
-        # First look for an exact match
-        so_name = os.path.join(lib_dir, so_basename)
-        if os.path.isfile(so_name):
-            return so_name
-        # Look for a versioned library
-        # Using sort here mainly to make the result deterministic.
-        for node in sorted(glob.glob(os.path.join(lib_dir, file_wild))):
-            so_name = os.path.join(lib_dir, node)
-            if os.path.isfile(so_name):
-                return so_name
-    _no_such_file_in_sub_dirs(nvidia_sub_dirs, file_wild, error_messages, attachments)
-    return None
-
-
-def _find_dll_using_nvidia_bin_dirs(libname, error_messages, attachments):
-    if libname == "nvvm":  # noqa: SIM108
-        nvidia_sub_dirs = ("nvidia", "*", "nvvm", "bin")
-    else:
-        nvidia_sub_dirs = ("nvidia", "*", "bin")
-    file_wild = libname + "*.dll"
-    for bin_dir in sys_path_find_sub_dirs(nvidia_sub_dirs):
-        for node in sorted(glob.glob(os.path.join(bin_dir, file_wild))):
-            dll_name = os.path.join(bin_dir, node)
-            if os.path.isfile(dll_name):
-                return dll_name
-    _no_such_file_in_sub_dirs(nvidia_sub_dirs, file_wild, error_messages, attachments)
-    return None
-
-
-def _get_cuda_paths_info(key, error_messages):
-    env_path_tuple = get_cuda_paths()[key]
-    if not env_path_tuple:
-        error_messages.append(f'Failure obtaining get_cuda_paths()["{key}"]')
-        return None
-    if not env_path_tuple.info:
-        error_messages.append(f'Failure obtaining get_cuda_paths()["{key}"].info')
-        return None
-    return env_path_tuple.info
-
-
-def _find_so_using_cudalib_dir(so_basename, error_messages, attachments):
-    cudalib_dir = _get_cuda_paths_info("cudalib_dir", error_messages)
-    if cudalib_dir is None:
-        return None
-    primary_so_dir = cudalib_dir + "/"
-    candidate_so_dirs = [primary_so_dir]
-    libs = ["/lib/", "/lib64/"]
-    for _ in range(2):
-        alt_dir = libs[0].join(primary_so_dir.rsplit(libs[1], 1))
-        if alt_dir not in candidate_so_dirs:
-            candidate_so_dirs.append(alt_dir)
-        libs.reverse()
-    candidate_so_names = [so_dirname + so_basename for so_dirname in candidate_so_dirs]
-    error_messages = []
-    for so_name in candidate_so_names:
-        if os.path.isfile(so_name):
-            return so_name
-        error_messages.append(f"No such file: {so_name}")
-    for so_dirname in candidate_so_dirs:
-        attachments.append(f'  listdir("{so_dirname}"):')
-        if not os.path.isdir(so_dirname):
-            attachments.append("    DIRECTORY DOES NOT EXIST")
-        else:
-            for node in sorted(os.listdir(so_dirname)):
-                attachments.append(f"    {node}")
-    return None
-
-
-def _find_dll_using_cudalib_dir(libname, error_messages, attachments):
-    cudalib_dir = _get_cuda_paths_info("cudalib_dir", error_messages)
-    if cudalib_dir is None:
-        return None
-    file_wild = libname + "*.dll"
-    for node in sorted(glob.glob(os.path.join(cudalib_dir, file_wild))):
-        dll_name = os.path.join(cudalib_dir, node)
-        if os.path.isfile(dll_name):
-            return dll_name
-    error_messages.append(f"No such file: {file_wild}")
-    attachments.append(f'  listdir("{cudalib_dir}"):')
-    for node in sorted(os.listdir(cudalib_dir)):
-        attachments.append(f"    {node}")
-    return None
-
-
-@functools.cache
-def find_nvidia_dynamic_library(name: str) -> str:
-    error_messages = []
-    attachments = []
-
-    if IS_WIN32:
-        dll_name = _find_dll_using_nvidia_bin_dirs(name, error_messages, attachments)
-        if dll_name is None:
-            if name == "nvvm":
-                dll_name = _get_cuda_paths_info("nvvm", error_messages)
-            else:
-                dll_name = _find_dll_using_cudalib_dir(name, error_messages, attachments)
-        if dll_name is None:
-            attachments = "\n".join(attachments)
-            raise RuntimeError(f"Failure finding {name}*.dll: {', '.join(error_messages)}\n{attachments}")
-        return dll_name
-
-    so_basename = f"lib{name}.so"
-    so_name = _find_so_using_nvidia_lib_dirs(name, so_basename, error_messages, attachments)
-    if so_name is None:
-        if name == "nvvm":
-            so_name = _get_cuda_paths_info("nvvm", error_messages)
-        else:
-            so_name = _find_so_using_cudalib_dir(so_basename, error_messages, attachments)
-    if so_name is None:
-        attachments = "\n".join(attachments)
-        raise RuntimeError(f"Failure finding {so_basename}: {', '.join(error_messages)}\n{attachments}")
-    return so_name
\ No newline at end of file
diff --git a/cuda_bindings/cuda/bindings/_path_finder/findlib.py b/cuda_bindings/cuda/bindings/_path_finder/findlib.py
deleted file mode 100644
index c64c3c9577..0000000000
--- a/cuda_bindings/cuda/bindings/_path_finder/findlib.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Forked from:
-# https://github.com/numba/numba/blob/f0d24824fcd6a454827e3c108882395d00befc04/numba/misc/findlib.py
-
-import os
-import re
-import sys
-
-
-def get_lib_dirs():
-    """
-    Anaconda specific
-    """
-    if sys.platform == "win32":
-        # on windows, historically `DLLs` has been used for CUDA libraries,
-        # since approximately CUDA 9.2, `Library\bin` has been used.
-        dirnames = ["DLLs", os.path.join("Library", "bin")]
-    else:
-        dirnames = [
-            "lib",
-        ]
-    libdirs = [os.path.join(sys.prefix, x) for x in dirnames]
-    return libdirs
-
-
-DLLNAMEMAP = {
-    "linux": r"lib%(name)s\.so\.%(ver)s$",
-    "linux2": r"lib%(name)s\.so\.%(ver)s$",
-    "linux-static": r"lib%(name)s\.a$",
-    "darwin": r"lib%(name)s\.%(ver)s\.dylib$",
-    "win32": r"%(name)s%(ver)s\.dll$",
-    "win32-static": r"%(name)s\.lib$",
-    "bsd": r"lib%(name)s\.so\.%(ver)s$",
-}
-
-RE_VER = r"[0-9]*([_\.][0-9]+)*"
-
-
-def find_lib(libname, libdir=None, platform=None, static=False):
-    platform = platform or sys.platform
-    platform = "bsd" if "bsd" in platform else platform
-    if static:
-        platform = f"{platform}-static"
-    if platform not in DLLNAMEMAP:
-        # Return empty list if platform name is undefined.
-        # Not all platforms define their static library paths.
-        return []
-    pat = DLLNAMEMAP[platform] % {"name": libname, "ver": RE_VER}
-    regex = re.compile(pat)
-    return find_file(regex, libdir)
-
-
-def find_file(pat, libdir=None):
-    if libdir is None:
-        libdirs = get_lib_dirs()
-    elif isinstance(libdir, str):
-        libdirs = [
-            libdir,
-        ]
-    else:
-        libdirs = list(libdir)
-    files = []
-    for ldir in libdirs:
-        try:
-            entries = os.listdir(ldir)
-        except FileNotFoundError:
-            continue
-        candidates = [os.path.join(ldir, ent) for ent in entries if pat.match(ent)]
-        files.extend([c for c in candidates if os.path.isfile(c)])
-    return files
\ No newline at end of file
diff --git a/cuda_bindings/cuda/bindings/_path_finder/load_nvidia_dynamic_library.py b/cuda_bindings/cuda/bindings/_path_finder/load_nvidia_dynamic_library.py
deleted file mode 100644
index 69aadabcbf..0000000000
--- a/cuda_bindings/cuda/bindings/_path_finder/load_nvidia_dynamic_library.py
+++ /dev/null
@@ -1,92 +0,0 @@
-import functools
-import sys
-
-if sys.platform == "win32":
-    import ctypes.wintypes
-
-    import pywintypes
-    import win32api
-
-    # Mirrors WinBase.h (unfortunately not defined already elsewhere)
-    _WINBASE_LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800
-
-else:
-    import ctypes
-    import os
-
-    _LINUX_CDLL_MODE = os.RTLD_NOW | os.RTLD_GLOBAL
-
-from .find_nvidia_dynamic_library import find_nvidia_dynamic_library
-
-
-@functools.cache
-def _windows_cuDriverGetVersion() -> int:
-    handle = win32api.LoadLibrary("nvcuda.dll")
-
-    kernel32 = ctypes.WinDLL("kernel32", use_last_error=True)
-    GetProcAddress = kernel32.GetProcAddress
-    GetProcAddress.argtypes = [ctypes.wintypes.HMODULE, ctypes.wintypes.LPCSTR]
-    GetProcAddress.restype = ctypes.c_void_p
-    cuDriverGetVersion = GetProcAddress(handle, b"cuDriverGetVersion")
-    assert cuDriverGetVersion
-
-    FUNC_TYPE = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.POINTER(ctypes.c_int))
-    cuDriverGetVersion_fn = FUNC_TYPE(cuDriverGetVersion)
-    driver_ver = ctypes.c_int()
-    err = cuDriverGetVersion_fn(ctypes.byref(driver_ver))
-    assert err == 0
-    return driver_ver.value
-
-
-@functools.cache
-def _windows_load_with_dll_basename(name: str) -> int:
-    driver_ver = _windows_cuDriverGetVersion()
-    del driver_ver  # Keeping this here because it will probably be needed in the future.
-
-    if name == "nvJitLink":
-        dll_name = "nvJitLink_120_0.dll"
-    elif name == "nvrtc":
-        dll_name = "nvrtc64_120_0.dll"
-    elif name == "nvvm":
-        dll_name = "nvvm64_40_0.dll"
-
-    try:
-        return win32api.LoadLibrary(dll_name)
-    except pywintypes.error:
-        pass
-
-    return None
-
-
-@functools.cache
-def load_nvidia_dynamic_library(name: str) -> int:
-    # First try using the platform-specific dynamic loader search mechanisms
-    if sys.platform == "win32":
-        handle = _windows_load_with_dll_basename(name)
-        if handle:
-            return handle
-    else:
-        dl_path = f"lib{name}.so"  # Version intentionally no specified.
-        try:
-            handle = ctypes.CDLL(dl_path, _LINUX_CDLL_MODE)
-        except OSError:
-            pass
-        else:
-            # Use `cdef void* ptr = <void*><uintptr_t>` in cython to convert back to void*
-            return handle._handle  # C unsigned int
-
-    dl_path = find_nvidia_dynamic_library(name)
-    if sys.platform == "win32":
-        try:
-            handle = win32api.LoadLibrary(dl_path)
-        except pywintypes.error as e:
-            raise RuntimeError(f"Failed to load DLL at {dl_path}: {e}") from e
-        # Use `cdef void* ptr = <void*><intptr_t>` in cython to convert back to void*
-        return handle  # C signed int, matches win32api.GetProcAddress
-    else:
-        try:
-            handle = ctypes.CDLL(dl_path, _LINUX_CDLL_MODE)
-        except OSError as e:
-            raise RuntimeError(f"Failed to dlopen {dl_path}: {e}") from e
-        # Use `cdef void* ptr = <void*><uintptr_t>` in cython to convert back to void*
-        return handle._handle  # C unsigned int
\ No newline at end of file
diff --git a/cuda_bindings/cuda/bindings/_path_finder/sys_path_find_sub_dirs.py b/cuda_bindings/cuda/bindings/_path_finder/sys_path_find_sub_dirs.py
deleted file mode 100644
index 324cdeec30..0000000000
--- a/cuda_bindings/cuda/bindings/_path_finder/sys_path_find_sub_dirs.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright 2024-2025 NVIDIA Corporation.  All rights reserved.
-#
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
-
-import functools
-import os
-import sys
-
-
-@functools.cache
-def _impl(sys_path, sub_dirs):
-    results = []
-    for base in sys_path:
-        stack = [(base, 0)]  # (current_path, index into sub_dirs)
-        while stack:
-            current_path, idx = stack.pop()
-            if idx == len(sub_dirs):
-                if os.path.isdir(current_path):
-                    results.append(current_path)
-                continue
-
-            sub = sub_dirs[idx]
-            if sub == "*":
-                try:
-                    entries = sorted(os.listdir(current_path))
-                except OSError:
-                    continue
-                for entry in entries:
-                    entry_path = os.path.join(current_path, entry)
-                    if os.path.isdir(entry_path):
-                        stack.append((entry_path, idx + 1))
-            else:
-                next_path = os.path.join(current_path, sub)
-                if os.path.isdir(next_path):
-                    stack.append((next_path, idx + 1))
-    return results
-
-
-def sys_path_find_sub_dirs(sub_dirs):
-    return _impl(tuple(sys.path), tuple(sub_dirs))
\ No newline at end of file

From 9a5d5fea346410987266e9d85f32c84046306ad1 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Fri, 19 Dec 2025 19:43:20 +0000
Subject: [PATCH 24/79] use cuda_pathfinder module for libdevice

---
 cuda_core/cuda/core/experimental/_program.py  |  43 +-----
 cuda_pathfinder/cuda/pathfinder/__init__.py   |   5 +
 .../_dynamic_libs/find_libdevice.py           | 146 ++++++++++++++++++
 3 files changed, 154 insertions(+), 40 deletions(-)
 create mode 100644 cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_libdevice.py

diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index a5ad2a3be2..fedafbc4fb 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -103,46 +103,9 @@ def _get_nvvm_module():
         raise e
 
 def _find_libdevice_path():
-    """
-    Find libdevice.10.bc using cuda.bindings.path_finder.
-    
-    Returns:
-        str: Path to libdevice.10.bc, or None if not found
-    """
-    try:
-        from cuda.bindings.path_finder import (
-            get_nvidia_libdevice_ctk,
-            get_libdevice_wheel,
-            get_debian_pkg_libdevice,
-        )
-        
-        for getter in [get_nvidia_libdevice_ctk, get_libdevice_wheel, get_debian_pkg_libdevice]:
-            try:
-                result = getter()
-                if result is not None and result.info is not None:
-                    return result.info
-            except Exception:
-                continue
-        
-        return None
-    except ImportError:
-        import os
-        
-        # CUDA_HOME
-        cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH")
-        if cuda_home:
-            libdevice_path = os.path.join(cuda_home, "nvvm", "libdevice", "libdevice.10.bc")
-            if os.path.isfile(libdevice_path):
-                return libdevice_path
-        
-        # Linux paths
-        for base in ["/usr/local/cuda", "/opt/cuda"]:
-            libdevice_path = os.path.join(base, "nvvm", "libdevice", "libdevice.10.bc")
-            if os.path.isfile(libdevice_path):
-                return libdevice_path
-        
-        return None
-
+    """Find libdevice.10.bc for NVVM compilation using cuda.pathfinder."""
+    from cuda.pathfinder import get_libdevice_path
+    return get_libdevice_path()
 
 def _process_define_macro_inner(formatted_options, macro):
     if isinstance(macro, str):
diff --git a/cuda_pathfinder/cuda/pathfinder/__init__.py b/cuda_pathfinder/cuda/pathfinder/__init__.py
index 143c4b45cc..8820917318 100644
--- a/cuda_pathfinder/cuda/pathfinder/__init__.py
+++ b/cuda_pathfinder/cuda/pathfinder/__init__.py
@@ -6,6 +6,11 @@
 from cuda.pathfinder._dynamic_libs.load_dl_common import DynamicLibNotFoundError as DynamicLibNotFoundError
 from cuda.pathfinder._dynamic_libs.load_dl_common import LoadedDL as LoadedDL
 from cuda.pathfinder._dynamic_libs.load_nvidia_dynamic_lib import load_nvidia_dynamic_lib as load_nvidia_dynamic_lib
+from cuda.pathfinder._dynamic_libs.find_libdevice import (
+    LibdeviceNotFoundError as LibdeviceNotFoundError,
+    find_libdevice as find_libdevice,
+    get_libdevice_path as get_libdevice_path,
+)
 from cuda.pathfinder._dynamic_libs.supported_nvidia_libs import (
     SUPPORTED_LIBNAMES as SUPPORTED_NVIDIA_LIBNAMES,  # noqa: F401
 )
diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_libdevice.py
new file mode 100644
index 0000000000..f1c0763765
--- /dev/null
+++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_libdevice.py
@@ -0,0 +1,146 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+import functools
+import glob
+import os
+
+from cuda.pathfinder._dynamic_libs.supported_nvidia_libs import IS_WINDOWS
+from cuda.pathfinder._utils.find_sub_dirs import find_sub_dirs_all_sitepackages
+
+# Site-package paths for libdevice (following SITE_PACKAGES_LIBDIRS pattern)
+SITE_PACKAGES_LIBDEVICE_DIRS = (
+    "nvidia/cuda_nvvm/nvvm/libdevice",   # CTK 13+
+    "nvidia/cuda_nvcc/nvvm/libdevice",   # CTK <13
+)
+
+
+class LibdeviceNotFoundError(Exception):
+    pass
+
+
+def _get_cuda_home_or_path() -> str | None:
+    return os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH")
+
+
+def _no_such_file_in_dir(
+    dir_path: str, filename: str, error_messages: list[str], attachments: list[str]
+) -> None:
+    error_messages.append(f"No such file: {os.path.join(dir_path, filename)}")
+    if os.path.isdir(dir_path):
+        attachments.append(f'  listdir("{dir_path}"):')
+        for node in sorted(os.listdir(dir_path)):
+            attachments.append(f"    {node}")
+    else:
+        attachments.append(f'  Directory does not exist: "{dir_path}"')
+
+
+class _FindLibdevice:
+    FILENAME = "libdevice.10.bc"
+    REL_PATH = os.path.join("nvvm", "libdevice")
+    
+    def __init__(self):
+        self.error_messages: list[str] = []
+        self.attachments: list[str] = []
+        self.abs_path: str | None = None
+    
+    def try_site_packages(self) -> str | None:
+        for rel_dir in SITE_PACKAGES_LIBDEVICE_DIRS:
+            sub_dir = tuple(rel_dir.split("/"))
+            for abs_dir in find_sub_dirs_all_sitepackages(sub_dir):
+                file_path = os.path.join(abs_dir, self.FILENAME)
+                if os.path.isfile(file_path):
+                    return file_path
+        return None
+    
+    def try_with_conda_prefix(self) -> str | None:
+        conda_prefix = os.environ.get("CONDA_PREFIX")
+        if not conda_prefix:
+            return None
+        
+        anchor = os.path.join(conda_prefix, "Library") if IS_WINDOWS else conda_prefix
+        file_path = os.path.join(anchor, self.REL_PATH, self.FILENAME)
+        if os.path.isfile(file_path):
+            return file_path
+        return None
+    
+    def try_with_cuda_home(self) -> str | None:
+        cuda_home = _get_cuda_home_or_path()
+        if cuda_home is None:
+            self.error_messages.append("CUDA_HOME/CUDA_PATH not set")
+            return None
+        
+        file_path = os.path.join(cuda_home, self.REL_PATH, self.FILENAME)
+        if os.path.isfile(file_path):
+            return file_path
+        
+        _no_such_file_in_dir(
+            os.path.join(cuda_home, self.REL_PATH),
+            self.FILENAME,
+            self.error_messages,
+            self.attachments,
+        )
+        return None
+    
+    def try_common_paths(self) -> str | None:
+        if IS_WINDOWS:
+            bases = [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"]
+        else:
+            bases = ["/usr/local/cuda", "/opt/cuda"]
+        
+        for base in bases:
+            # Direct path
+            file_path = os.path.join(base, self.REL_PATH, self.FILENAME)
+            if os.path.isfile(file_path):
+                return file_path
+            # Versioned paths (e.g., /usr/local/cuda-13.0)
+            for versioned in sorted(glob.glob(base + "*"), reverse=True):
+                if os.path.isdir(versioned):
+                    file_path = os.path.join(versioned, self.REL_PATH, self.FILENAME)
+                    if os.path.isfile(file_path):
+                        return file_path
+        return None
+    
+    def raise_not_found_error(self) -> None:
+        err = ", ".join(self.error_messages) if self.error_messages else "No search paths available"
+        att = "\n".join(self.attachments) if self.attachments else ""
+        raise LibdeviceNotFoundError(
+            f'Failure finding "{self.FILENAME}": {err}\n{att}'
+        )
+
+
+@functools.cache
+def find_libdevice() -> str:
+    """Find the path to libdevice.10.bc.
+    Raises:
+        LibdeviceNotFoundError: If libdevice.10.bc cannot be found
+    """
+    finder = _FindLibdevice()
+    
+    abs_path = finder.try_site_packages()
+    if abs_path is None:
+        abs_path = finder.try_with_conda_prefix()
+    if abs_path is None:
+        abs_path = finder.try_with_cuda_home()
+    if abs_path is None:
+        abs_path = finder.try_common_paths()
+    
+    if abs_path is None:
+        finder.raise_not_found_error()
+    
+    return abs_path
+
+
+def get_libdevice_path() -> str | None:
+    """Get the path to libdevice.10.bc, or None if not found."""
+    finder = _FindLibdevice()
+    
+    abs_path = finder.try_site_packages()
+    if abs_path is None:
+        abs_path = finder.try_with_conda_prefix()
+    if abs_path is None:
+        abs_path = finder.try_with_cuda_home()
+    if abs_path is None:
+        abs_path = finder.try_common_paths()
+    
+    return abs_path
+

From 07c619958cbdbdda7360837ac6c982cbefb3b2f6 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Thu, 5 Feb 2026 17:37:45 +0000
Subject: [PATCH 25/79] rebase

---
 cuda_core/cuda/core/experimental/_program.py | 46 +++++++++-----------
 cuda_core/tests/test_program.py              | 32 +++++++-------
 2 files changed, 37 insertions(+), 41 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index fedafbc4fb..47b78fd831 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -13,11 +13,11 @@
 if TYPE_CHECKING:
     import cuda.bindings
 
-from cuda.core.experimental._device import Device
-from cuda.core.experimental._linker import Linker, LinkerHandleT, LinkerOptions
-from cuda.core.experimental._module import ObjectCode
-from cuda.core.experimental._utils.clear_error_support import assert_type
-from cuda.core.experimental._utils.cuda_utils import (
+from cuda.core._device import Device
+from cuda.core._linker import Linker, LinkerHandleT, LinkerOptions
+from cuda.core._module import ObjectCode
+from cuda.core._utils.clear_error_support import assert_type
+from cuda.core._utils.cuda_utils import (
     CUDAError,
     _handle_boolean_option,
     check_or_create_options,
@@ -102,10 +102,6 @@ def _get_nvvm_module():
         _nvvm_module = None
         raise e
 
-def _find_libdevice_path():
-    """Find libdevice.10.bc for NVVM compilation using cuda.pathfinder."""
-    from cuda.pathfinder import get_libdevice_path
-    return get_libdevice_path()
 
 def _process_define_macro_inner(formatted_options, macro):
     if isinstance(macro, str):
@@ -118,7 +114,11 @@ def _process_define_macro_inner(formatted_options, macro):
         return True
     return False
 
-
+def _find_libdevice_path():
+    """Find libdevice.10.bc for NVVM compilation using cuda.pathfinder."""
+    from cuda.pathfinder import get_libdevice_path
+    return get_libdevice_path()
+    
 def _process_define_macro(formatted_options, macro):
     union_type = "Union[str, tuple[str, str]]"
     if _process_define_macro_inner(formatted_options, macro):
@@ -236,13 +236,9 @@ class ProgramOptions:
     no_display_error_number : bool, optional
         Disable the display of a diagnostic number for warning messages.
         Default: False
-    diag_error : Union[int, list[int]], optional
-        Emit error for a specified diagnostic message number or comma separated list of numbers.
-        Default: None
-    diag_suppress : Union[int, list[int]], optional
-        Suppress a specified diagnostic message number or comma separated list of numbers.
-        Default: None
-    diag_warn : Union[int, list[int]], optional
+    diag_error: Union[int, list[int], tuple[int]] | None = None
+    diag_suppress: Union[int, list[int], tuple[int]] | None = None
+    diag_warn: Union[int, list[int], tuple[int]] | None = None
         Emit warning for a specified diagnostic message number or comma separated lis of numbers.
         Default: None
     brief_diagnostics : bool, optional
@@ -339,7 +335,6 @@ class ProgramOptions:
     split_compile: int | None = None
     fdevice_syntax_only: bool | None = None
     minimal: bool | None = None
-    # Creating as 2 tuples ((names, source), (names,source))
     extra_sources: (
         Union[List[Tuple[str, Union[str, bytes, bytearray]]], Tuple[Tuple[str, Union[str, bytes, bytearray]]]] | None
     ) = None
@@ -356,7 +351,7 @@ class ProgramOptions:
     pch_messages: bool | None = None
     instantiate_templates_in_pch: bool | None = None
     numba_debug: bool | None = None  # Custom option for Numba debugging
-    use_libdevice: bool | None = None # Use libdevice
+    use_libdevice: bool | None = None # For libdevice execution 
 
     def __post_init__(self):
         self._name = self.name.encode()
@@ -501,7 +496,7 @@ def _prepare_nvrtc_options(self) -> list[bytes]:
             options.append("--numba-debug")
         return [o.encode() for o in options]
 
-    def _prepare_nvvm_options(self, as_bytes: bool = True) -> Union[list[bytes], list[str]]:
+     def _prepare_nvvm_options(self, as_bytes: bool = True) -> Union[list[bytes], list[str]]:
         options = []
 
         # Options supported by NVVM
@@ -690,11 +685,11 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
         if code_type == "c++":
             assert_type(code, str)
             # TODO: support pre-loaded headers & include names
-
+            # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved
             if options.extra_sources is not None:
                 raise ValueError("extra_sources is not supported by the NVRTC backend (C++ code_type)")
 
-            # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved
+
             self._mnff.handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), options._name, 0, [], []))
             self._mnff.backend = "NVRTC"
             self._backend = "NVRTC"
@@ -704,7 +699,6 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
             assert_type(code, str)
             if options.extra_sources is not None:
                 raise ValueError("extra_sources is not supported by the PTX backend.")
-
             self._linker = Linker(
                 ObjectCode._init(code.encode(), code_type), options=self._translate_program_options(options)
             )
@@ -872,7 +866,7 @@ def compile(self, target_type, name_expressions=(), logs=None):
             nvvm = _get_nvvm_module()
             with _nvvm_exception_manager(self):
                 nvvm.verify_program(self._mnff.handle, len(nvvm_options), nvvm_options)
-                # Invoke libdevice
+                # libdevice compilation 
                 if getattr(self, '_use_libdevice', False):
                     libdevice_path = _find_libdevice_path()
                     if libdevice_path is None:
@@ -882,7 +876,7 @@ def compile(self, target_type, name_expressions=(), logs=None):
                         )
                     with open(libdevice_path, "rb") as f:
                         libdevice_bc = f.read()
-                    # Use lazy_add_module for libdevice bitcode only following numba-cuda
+                    # libdevice for numba-cuda
                     nvvm.lazy_add_module_to_program(self._mnff.handle, libdevice_bc, len(libdevice_bc), None)
                 
                 nvvm.compile_program(self._mnff.handle, len(nvvm_options), nvvm_options)
@@ -923,4 +917,4 @@ def handle(self) -> ProgramHandleT:
             This handle is a Python object. To get the memory address of the underlying C
             handle, call ``int(Program.handle)``.
         """
-        return self._mnff.handle
+        return self._mnff.handle
\ No newline at end of file
diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index 00d0709870..c8effbb825 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
@@ -6,11 +6,11 @@
 import warnings
 
 import pytest
-from cuda.core.experimental import _linker
-from cuda.core.experimental._device import Device
-from cuda.core.experimental._module import Kernel, ObjectCode
-from cuda.core.experimental._program import Program, ProgramOptions
-from cuda.core.experimental._utils.cuda_utils import CUDAError, driver, handle_return
+from cuda.core import _linker
+from cuda.core._device import Device
+from cuda.core._module import Kernel, ObjectCode
+from cuda.core._program import Program, ProgramOptions
+from cuda.core._utils.cuda_utils import CUDAError, driver, handle_return
 
 cuda_driver_version = handle_return(driver.cuDriverGetVersion())
 is_culink_backend = _linker._decide_nvjitlink_or_driver()
@@ -21,12 +21,11 @@
     _test_helpers_available = True
 except ImportError:
     _test_helpers_available = False
-
-
+    
 def _is_nvvm_available():
     """Check if NVVM is available."""
     try:
-        from cuda.core.experimental._program import _get_nvvm_module
+        from cuda.core._program import _get_nvvm_module
 
         _get_nvvm_module()
         return True
@@ -39,7 +38,7 @@ def _is_nvvm_available():
 )
 
 try:
-    from cuda.core.experimental._utils.cuda_utils import driver, handle_return
+    from cuda.core._utils.cuda_utils import driver, handle_return, nvrtc
 
     _cuda_driver_version = handle_return(driver.cuDriverGetVersion())
 except Exception:
@@ -49,6 +48,7 @@ def _is_nvvm_available():
 def _get_nvrtc_version_for_tests():
     """
     Get NVRTC version.
+
     Returns:
         int: Version in format major * 1000 + minor * 100 (e.g., 13200 for CUDA 13.2)
         None: If NVRTC is not available
@@ -60,6 +60,7 @@ def _get_nvrtc_version_for_tests():
     except Exception:
         return None
 
+
 _libnvvm_version = None
 _libnvvm_version_attempted = False
 
@@ -97,7 +98,7 @@ def _get_libnvvm_version_for_tests():
     _libnvvm_version_attempted = True
 
     try:
-        from cuda.core.experimental._program import _get_nvvm_module
+        from cuda.core._program import _get_nvvm_module
 
         nvvm = _get_nvvm_module()
 
@@ -145,7 +146,7 @@ def nvvm_ir():
     fallback assumes no version metadata will be present in
     the input nvvm ir
     """
-    from cuda.core.experimental._program import _get_nvvm_module
+    from cuda.core._program import _get_nvvm_module
 
     nvvm = _get_nvvm_module()
     major, minor, debug_major, debug_minor = nvvm.ir_version()
@@ -340,7 +341,7 @@ def test_cpp_program_with_pch_options(init_cuda, tmp_path):
 
 @pytest.mark.parametrize("options", options)
 def test_ptx_program_with_various_options(init_cuda, ptx_code_object, options):
-    program = Program(ptx_code_object._module.decode(), "ptx", options=options)
+    program = Program(ptx_code_object.code.decode(), "ptx", options=options)
     assert program.backend == ("driver" if is_culink_backend else "nvJitLink")
     program.compile("cubin")
     program.close()
@@ -383,7 +384,7 @@ def test_program_compile_valid_target_type(init_cuda):
         ptx_kernel = ptx_object_code.get_kernel("my_kernel")
         assert isinstance(ptx_kernel, Kernel)
 
-    program = Program(ptx_object_code._module.decode(), "ptx", options={"name": "24"})
+    program = Program(ptx_object_code.code.decode(), "ptx", options={"name": "24"})
     cubin_object_code = program.compile("cubin")
     assert isinstance(cubin_object_code, ObjectCode)
     assert cubin_object_code.name == "24"
@@ -420,7 +421,7 @@ def test_program_close():
 @nvvm_available
 def test_nvvm_deferred_import():
     """Test that our deferred NVVM import works correctly"""
-    from cuda.core.experimental._program import _get_nvvm_module
+    from cuda.core._program import _get_nvvm_module
 
     nvvm = _get_nvvm_module()
     assert nvvm is not None
@@ -689,6 +690,7 @@ def test_cpp_program_with_extra_sources():
     options = ProgramOptions(extra_sources=helper)
     with pytest.raises(ValueError, match="extra_sources is not supported by the NVRTC backend"):
         Program(code, "c++", options)
+        
 def test_program_options_as_bytes_nvrtc():
     """Test ProgramOptions.as_bytes() for NVRTC backend"""
     options = ProgramOptions(arch="sm_80", debug=True, lineinfo=True, ftz=True)

From e1b19cc8f51e754a7b997c6a5efc69f1514a4c83 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Thu, 5 Feb 2026 17:42:05 +0000
Subject: [PATCH 26/79] rebase

---
 cuda_core/cuda/core/{experimental => }/_program.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename cuda_core/cuda/core/{experimental => }/_program.py (100%)

diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/_program.py
similarity index 100%
rename from cuda_core/cuda/core/experimental/_program.py
rename to cuda_core/cuda/core/_program.py

From dcdd1009bfdf4f998010f065a892746f32444fd5 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 6 Feb 2026 12:22:31 -0800
Subject: [PATCH 27/79] tests

---
 cuda_pathfinder/tests/test_find_libdevice.py | 94 ++++++++++++++++++++
 1 file changed, 94 insertions(+)
 create mode 100644 cuda_pathfinder/tests/test_find_libdevice.py

diff --git a/cuda_pathfinder/tests/test_find_libdevice.py b/cuda_pathfinder/tests/test_find_libdevice.py
new file mode 100644
index 0000000000..2d24f397fe
--- /dev/null
+++ b/cuda_pathfinder/tests/test_find_libdevice.py
@@ -0,0 +1,94 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+
+import pytest
+
+from cuda.pathfinder import find_libdevice
+from cuda.pathfinder._dynamic_libs import find_libdevice as find_libdevice_module
+
+FILENAME = "libdevice.10.bc"
+
+SITE_PACKAGES_REL_DIR_CUDA12 = "nvidia/cuda_nvcc/nvvm/libdevice"
+SITE_PACKAGES_REL_DIR_CUDA13 = "nvidia/cuda_nvvm/nvvm/libdevice"
+
+
+@pytest.fixture
+def clear_find_libdevice_cache():
+    find_libdevice.cache_clear()
+    yield
+    find_libdevice.cache_clear()
+
+
+def _make_libdevice_file(dir_path: str) -> str:
+    os.makedirs(dir_path, exist_ok=True)
+    file_path = os.path.join(dir_path, FILENAME)
+    with open(file_path, "wb"):
+        pass
+    return file_path
+
+
+@pytest.mark.parametrize("rel_dir", [SITE_PACKAGES_REL_DIR_CUDA12, SITE_PACKAGES_REL_DIR_CUDA13])
+@pytest.mark.usefixtures("clear_find_libdevice_cache")
+def test_find_libdevice_via_site_packages(monkeypatch, mocker, tmp_path, rel_dir):
+    libdevice_dir = tmp_path.joinpath(*rel_dir.split("/"))
+    expected_path = str(_make_libdevice_file(str(libdevice_dir)))
+
+    mocker.patch.object(
+        find_libdevice_module,
+        "find_sub_dirs_all_sitepackages",
+        return_value=[str(libdevice_dir)],
+    )
+    monkeypatch.delenv("CONDA_PREFIX", raising=False)
+    monkeypatch.delenv("CUDA_HOME", raising=False)
+    monkeypatch.delenv("CUDA_PATH", raising=False)
+
+    result = find_libdevice()
+
+    assert result == expected_path
+    assert os.path.isfile(result)
+
+
+# same for cu12/cu13
+@pytest.mark.usefixtures("clear_find_libdevice_cache")
+def test_find_libdevice_via_conda(monkeypatch, mocker, tmp_path):
+    rel_path = os.path.join("nvvm", "libdevice")
+    libdevice_dir = tmp_path / rel_path
+    expected_path = str(_make_libdevice_file(str(libdevice_dir)))
+
+    mocker.patch.object(find_libdevice_module, "IS_WINDOWS", False)
+    mocker.patch.object(
+        find_libdevice_module,
+        "find_sub_dirs_all_sitepackages",
+        return_value=[],
+    )
+    monkeypatch.setenv("CONDA_PREFIX", str(tmp_path))
+    monkeypatch.delenv("CUDA_HOME", raising=False)
+    monkeypatch.delenv("CUDA_PATH", raising=False)
+
+    result = find_libdevice()
+
+    assert result == expected_path
+    assert os.path.isfile(result)
+
+
+@pytest.mark.usefixtures("clear_find_libdevice_cache")
+def test_find_libdevice_via_cuda_home(monkeypatch, mocker, tmp_path):
+    rel_path = os.path.join("nvvm", "libdevice")
+    libdevice_dir = tmp_path / rel_path
+    expected_path = str(_make_libdevice_file(str(libdevice_dir)))
+
+    mocker.patch.object(
+        find_libdevice_module,
+        "find_sub_dirs_all_sitepackages",
+        return_value=[],
+    )
+    monkeypatch.delenv("CONDA_PREFIX", raising=False)
+    monkeypatch.setenv("CUDA_HOME", str(tmp_path))
+    monkeypatch.delenv("CUDA_PATH", raising=False)
+
+    result = find_libdevice()
+
+    assert result == expected_path
+    assert os.path.isfile(result)

From aca2e361a4d503b7e289dd31db86842a8fff1384 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 11 Feb 2026 07:42:44 -0800
Subject: [PATCH 28/79] Address reviews

---
 cuda_core/tests/test_program.py               | 12 +--
 .../_dynamic_libs/find_libdevice.py           | 97 ++++++++-----------
 2 files changed, 43 insertions(+), 66 deletions(-)

diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index c8effbb825..bf099c860a 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -15,13 +15,7 @@
 cuda_driver_version = handle_return(driver.cuDriverGetVersion())
 is_culink_backend = _linker._decide_nvjitlink_or_driver()
 
-try:
-    from cuda_python_test_helpers.nvvm_bitcode import minimal_nvvmir
 
-    _test_helpers_available = True
-except ImportError:
-    _test_helpers_available = False
-    
 def _is_nvvm_available():
     """Check if NVVM is available."""
     try:
@@ -658,7 +652,6 @@ def test_nvvm_program_with_multiple_extra_sources():
 
 
 @nvvm_available
-@pytest.mark.skipif(not _test_helpers_available, reason="cuda_python_test_helpers not accessible")
 def test_bitcode_format(minimal_nvvmir):
     if len(minimal_nvvmir) < 4:
         pytest.skip("Bitcode file is not valid or empty")
@@ -690,7 +683,8 @@ def test_cpp_program_with_extra_sources():
     options = ProgramOptions(extra_sources=helper)
     with pytest.raises(ValueError, match="extra_sources is not supported by the NVRTC backend"):
         Program(code, "c++", options)
-        
+
+
 def test_program_options_as_bytes_nvrtc():
     """Test ProgramOptions.as_bytes() for NVRTC backend"""
     options = ProgramOptions(arch="sm_80", debug=True, lineinfo=True, ftz=True)
@@ -738,4 +732,4 @@ def test_program_options_as_bytes_nvvm_unsupported_option():
     """Test that unsupported options raise CUDAError for NVVM backend"""
     options = ProgramOptions(arch="sm_80", lineinfo=True)
     with pytest.raises(CUDAError, match="not supported by NVVM backend"):
-        options.as_bytes("nvvm")
\ No newline at end of file
+        options.as_bytes("nvvm")
diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_libdevice.py
index f1c0763765..29296cfbde 100644
--- a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_libdevice.py
+++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_libdevice.py
@@ -1,30 +1,27 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 import functools
 import glob
 import os
 
 from cuda.pathfinder._dynamic_libs.supported_nvidia_libs import IS_WINDOWS
+from cuda.pathfinder._utils.env_vars import get_cuda_home_or_path
 from cuda.pathfinder._utils.find_sub_dirs import find_sub_dirs_all_sitepackages
 
 # Site-package paths for libdevice (following SITE_PACKAGES_LIBDIRS pattern)
 SITE_PACKAGES_LIBDEVICE_DIRS = (
-    "nvidia/cuda_nvvm/nvvm/libdevice",   # CTK 13+
-    "nvidia/cuda_nvcc/nvvm/libdevice",   # CTK <13
+    "nvidia/cuda_nvvm/nvvm/libdevice",  # CTK 13+
+    "nvidia/cuda_nvcc/nvvm/libdevice",  # CTK <13
 )
 
-
-class LibdeviceNotFoundError(Exception):
-    pass
+FILENAME = "libdevice.10.bc"
 
 
-def _get_cuda_home_or_path() -> str | None:
-    return os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH")
+class LibdeviceNotFoundError(RuntimeError):
+    pass
 
 
-def _no_such_file_in_dir(
-    dir_path: str, filename: str, error_messages: list[str], attachments: list[str]
-) -> None:
+def _no_such_file_in_dir(dir_path: str, filename: str, error_messages: list[str], attachments: list[str]) -> None:
     error_messages.append(f"No such file: {os.path.join(dir_path, filename)}")
     if os.path.isdir(dir_path):
         attachments.append(f'  listdir("{dir_path}"):')
@@ -35,105 +32,80 @@ def _no_such_file_in_dir(
 
 
 class _FindLibdevice:
-    FILENAME = "libdevice.10.bc"
     REL_PATH = os.path.join("nvvm", "libdevice")
-    
-    def __init__(self):
+
+    def __init__(self) -> None:
         self.error_messages: list[str] = []
         self.attachments: list[str] = []
         self.abs_path: str | None = None
-    
+
     def try_site_packages(self) -> str | None:
         for rel_dir in SITE_PACKAGES_LIBDEVICE_DIRS:
             sub_dir = tuple(rel_dir.split("/"))
             for abs_dir in find_sub_dirs_all_sitepackages(sub_dir):
-                file_path = os.path.join(abs_dir, self.FILENAME)
+                file_path = os.path.join(abs_dir, FILENAME)
                 if os.path.isfile(file_path):
                     return file_path
         return None
-    
+
     def try_with_conda_prefix(self) -> str | None:
         conda_prefix = os.environ.get("CONDA_PREFIX")
         if not conda_prefix:
             return None
-        
+
         anchor = os.path.join(conda_prefix, "Library") if IS_WINDOWS else conda_prefix
-        file_path = os.path.join(anchor, self.REL_PATH, self.FILENAME)
+        file_path = os.path.join(anchor, self.REL_PATH, FILENAME)
         if os.path.isfile(file_path):
             return file_path
         return None
-    
+
     def try_with_cuda_home(self) -> str | None:
-        cuda_home = _get_cuda_home_or_path()
+        cuda_home = get_cuda_home_or_path()
         if cuda_home is None:
             self.error_messages.append("CUDA_HOME/CUDA_PATH not set")
             return None
-        
-        file_path = os.path.join(cuda_home, self.REL_PATH, self.FILENAME)
+
+        file_path = os.path.join(cuda_home, self.REL_PATH, FILENAME)
         if os.path.isfile(file_path):
             return file_path
-        
+
         _no_such_file_in_dir(
             os.path.join(cuda_home, self.REL_PATH),
-            self.FILENAME,
+            FILENAME,
             self.error_messages,
             self.attachments,
         )
         return None
-    
+
     def try_common_paths(self) -> str | None:
         if IS_WINDOWS:
             bases = [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"]
         else:
             bases = ["/usr/local/cuda", "/opt/cuda"]
-        
+
         for base in bases:
             # Direct path
-            file_path = os.path.join(base, self.REL_PATH, self.FILENAME)
+            file_path = os.path.join(base, self.REL_PATH, FILENAME)
             if os.path.isfile(file_path):
                 return file_path
             # Versioned paths (e.g., /usr/local/cuda-13.0)
             for versioned in sorted(glob.glob(base + "*"), reverse=True):
                 if os.path.isdir(versioned):
-                    file_path = os.path.join(versioned, self.REL_PATH, self.FILENAME)
+                    file_path = os.path.join(versioned, self.REL_PATH, FILENAME)
                     if os.path.isfile(file_path):
                         return file_path
         return None
-    
+
     def raise_not_found_error(self) -> None:
         err = ", ".join(self.error_messages) if self.error_messages else "No search paths available"
         att = "\n".join(self.attachments) if self.attachments else ""
-        raise LibdeviceNotFoundError(
-            f'Failure finding "{self.FILENAME}": {err}\n{att}'
-        )
-
-
-@functools.cache
-def find_libdevice() -> str:
-    """Find the path to libdevice.10.bc.
-    Raises:
-        LibdeviceNotFoundError: If libdevice.10.bc cannot be found
-    """
-    finder = _FindLibdevice()
-    
-    abs_path = finder.try_site_packages()
-    if abs_path is None:
-        abs_path = finder.try_with_conda_prefix()
-    if abs_path is None:
-        abs_path = finder.try_with_cuda_home()
-    if abs_path is None:
-        abs_path = finder.try_common_paths()
-    
-    if abs_path is None:
-        finder.raise_not_found_error()
-    
-    return abs_path
+        raise LibdeviceNotFoundError(f'Failure finding "{FILENAME}": {err}\n{att}')
 
 
 def get_libdevice_path() -> str | None:
     """Get the path to libdevice.10.bc, or None if not found."""
     finder = _FindLibdevice()
-    
+
     abs_path = finder.try_site_packages()
     if abs_path is None:
         abs_path = finder.try_with_conda_prefix()
@@ -141,6 +113,17 @@ def get_libdevice_path() -> str | None:
         abs_path = finder.try_with_cuda_home()
     if abs_path is None:
         abs_path = finder.try_common_paths()
-    
+
     return abs_path
 
+
+@functools.cache
+def find_libdevice() -> str:
+    """Find the path to libdevice*.bc.
+    Raises:
+        LibdeviceNotFoundError: If libdevice.10.bc cannot be found
+    """
+    path_or_none = get_libdevice_path()
+    if path_or_none is None:
+        raise LibdeviceNotFoundError(f"{FILENAME} not found")
+    return path_or_none

From af6e70ac8e3e92c8be073ce2b362763fa7f5a087 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 11 Feb 2026 07:50:35 -0800
Subject: [PATCH 29/79] put libdevice stuff under _static_libs

---
 cuda_pathfinder/cuda/pathfinder/__init__.py      | 16 ++++++++++------
 .../find_libdevice.py                            |  0
 cuda_pathfinder/tests/test_find_libdevice.py     |  2 +-
 3 files changed, 11 insertions(+), 7 deletions(-)
 rename cuda_pathfinder/cuda/pathfinder/{_dynamic_libs => _static_libs}/find_libdevice.py (100%)

diff --git a/cuda_pathfinder/cuda/pathfinder/__init__.py b/cuda_pathfinder/cuda/pathfinder/__init__.py
index 3b2d44eb5d..ece060d81a 100644
--- a/cuda_pathfinder/cuda/pathfinder/__init__.py
+++ b/cuda_pathfinder/cuda/pathfinder/__init__.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
 """cuda.pathfinder public APIs"""
@@ -10,11 +10,6 @@
 from cuda.pathfinder._dynamic_libs.load_dl_common import DynamicLibNotFoundError as DynamicLibNotFoundError
 from cuda.pathfinder._dynamic_libs.load_dl_common import LoadedDL as LoadedDL
 from cuda.pathfinder._dynamic_libs.load_nvidia_dynamic_lib import load_nvidia_dynamic_lib as load_nvidia_dynamic_lib
-from cuda.pathfinder._dynamic_libs.find_libdevice import (
-    LibdeviceNotFoundError as LibdeviceNotFoundError,
-    find_libdevice as find_libdevice,
-    get_libdevice_path as get_libdevice_path,
-)
 from cuda.pathfinder._dynamic_libs.supported_nvidia_libs import (
     SUPPORTED_LIBNAMES as SUPPORTED_NVIDIA_LIBNAMES,  # noqa: F401
 )
@@ -24,6 +19,15 @@
     locate_nvidia_header_directory as locate_nvidia_header_directory,
 )
 from cuda.pathfinder._headers.supported_nvidia_headers import SUPPORTED_HEADERS_CTK as _SUPPORTED_HEADERS_CTK
+from cuda.pathfinder._static_libs.find_libdevice import (
+    LibdeviceNotFoundError as LibdeviceNotFoundError,
+)
+from cuda.pathfinder._static_libs.find_libdevice import (
+    find_libdevice as find_libdevice,
+)
+from cuda.pathfinder._static_libs.find_libdevice import (
+    get_libdevice_path as get_libdevice_path,
+)
 
 from cuda.pathfinder._version import __version__  # isort: skip  # noqa: F401
 
diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
similarity index 100%
rename from cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_libdevice.py
rename to cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
diff --git a/cuda_pathfinder/tests/test_find_libdevice.py b/cuda_pathfinder/tests/test_find_libdevice.py
index 2d24f397fe..a1dfc0b64f 100644
--- a/cuda_pathfinder/tests/test_find_libdevice.py
+++ b/cuda_pathfinder/tests/test_find_libdevice.py
@@ -6,7 +6,7 @@
 import pytest
 
 from cuda.pathfinder import find_libdevice
-from cuda.pathfinder._dynamic_libs import find_libdevice as find_libdevice_module
+from cuda.pathfinder._static_libs import find_libdevice as find_libdevice_module
 
 FILENAME = "libdevice.10.bc"
 

From b1d423f591426283f7948237684fa6f1aff6c560 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Wed, 11 Feb 2026 16:32:14 +0000
Subject: [PATCH 30/79] refresh reviews

---
 .../pathfinder/_static_libs/find_libdevice.py | 22 ++++++++-----------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
index 29296cfbde..0cfcd4e493 100644
--- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
+++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
@@ -7,6 +7,7 @@
 from cuda.pathfinder._dynamic_libs.supported_nvidia_libs import IS_WINDOWS
 from cuda.pathfinder._utils.env_vars import get_cuda_home_or_path
 from cuda.pathfinder._utils.find_sub_dirs import find_sub_dirs_all_sitepackages
+from cuda.pathfinder._dynamic_libs.load_dl_common import DynamicLibNotFoundError as DynamicLibNotFoundError
 
 # Site-package paths for libdevice (following SITE_PACKAGES_LIBDIRS pattern)
 SITE_PACKAGES_LIBDEVICE_DIRS = (
@@ -15,11 +16,10 @@
 )
 
 FILENAME = "libdevice.10.bc"
-
-
-class LibdeviceNotFoundError(RuntimeError):
-    pass
-
+if IS_WINDOWS:
+    bases = [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"]
+else:
+    bases = ["/usr/local/cuda", "/opt/cuda"]
 
 def _no_such_file_in_dir(dir_path: str, filename: str, error_messages: list[str], attachments: list[str]) -> None:
     error_messages.append(f"No such file: {os.path.join(dir_path, filename)}")
@@ -78,10 +78,6 @@ def try_with_cuda_home(self) -> str | None:
         return None
 
     def try_common_paths(self) -> str | None:
-        if IS_WINDOWS:
-            bases = [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"]
-        else:
-            bases = ["/usr/local/cuda", "/opt/cuda"]
 
         for base in bases:
             # Direct path
@@ -99,11 +95,11 @@ def try_common_paths(self) -> str | None:
     def raise_not_found_error(self) -> None:
         err = ", ".join(self.error_messages) if self.error_messages else "No search paths available"
         att = "\n".join(self.attachments) if self.attachments else ""
-        raise LibdeviceNotFoundError(f'Failure finding "{FILENAME}": {err}\n{att}')
+        raise DynamicLibNotFoundError(f'Failure finding "{FILENAME}": {err}\n{att}')
 
 
 def get_libdevice_path() -> str | None:
-    """Get the path to libdevice.10.bc, or None if not found."""
+    """Get the path to libdevice*.bc, or None if not found."""
     finder = _FindLibdevice()
 
     abs_path = finder.try_site_packages()
@@ -121,9 +117,9 @@ def get_libdevice_path() -> str | None:
 def find_libdevice() -> str:
     """Find the path to libdevice*.bc.
     Raises:
-        LibdeviceNotFoundError: If libdevice.10.bc cannot be found
+        DynamicLibNotFoundError: If libdevice.10.bc cannot be found
     """
     path_or_none = get_libdevice_path()
     if path_or_none is None:
-        raise LibdeviceNotFoundError(f"{FILENAME} not found")
+        raise DynamicLibNotFoundError(f"{FILENAME} not found")
     return path_or_none

From 4cedbb7c2bb6790c6f8e4f7a60319c28663d412c Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Wed, 11 Feb 2026 17:37:12 +0000
Subject: [PATCH 31/79] change program to cython per PR 1565

---
 cuda_core/cuda/core/_program.pxd |   17 +
 cuda_core/cuda/core/_program.py  |  925 --------------------------
 cuda_core/cuda/core/_program.pyx | 1038 ++++++++++++++++++++++++++++++
 3 files changed, 1055 insertions(+), 925 deletions(-)
 create mode 100644 cuda_core/cuda/core/_program.pxd
 delete mode 100644 cuda_core/cuda/core/_program.py
 create mode 100644 cuda_core/cuda/core/_program.pyx

diff --git a/cuda_core/cuda/core/_program.pxd b/cuda_core/cuda/core/_program.pxd
new file mode 100644
index 0000000000..d4abe85ff8
--- /dev/null
+++ b/cuda_core/cuda/core/_program.pxd
@@ -0,0 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from ._resource_handles cimport NvrtcProgramHandle, NvvmProgramHandle
+
+
+cdef class Program:
+    cdef:
+        NvrtcProgramHandle _h_nvrtc
+        NvvmProgramHandle _h_nvvm
+        str _backend
+        object _linker  # Linker
+        object _options  # ProgramOptions
+        object __weakref__
+        bint _use_libdevice      # Flag for libdevice loading
+        int _module_count  
diff --git a/cuda_core/cuda/core/_program.py b/cuda_core/cuda/core/_program.py
deleted file mode 100644
index 34eac95270..0000000000
--- a/cuda_core/cuda/core/_program.py
+++ /dev/null
@@ -1,925 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-from __future__ import annotations
-
-import weakref
-from contextlib import contextmanager
-from dataclasses import dataclass
-from typing import TYPE_CHECKING, Union
-from warnings import warn
-
-if TYPE_CHECKING:
-    import cuda.bindings
-
-from cuda.core._device import Device
-from cuda.core._linker import Linker, LinkerHandleT, LinkerOptions
-from cuda.core._module import ObjectCode
-from cuda.core._utils.clear_error_support import assert_type
-from cuda.core._utils.cuda_utils import (
-    CUDAError,
-    _handle_boolean_option,
-    check_or_create_options,
-    driver,
-    get_binding_version,
-    handle_return,
-    is_nested_sequence,
-    is_sequence,
-    nvrtc,
-)
-
-
-@contextmanager
-def _nvvm_exception_manager(self):
-    """
-    Taken from _linker.py
-    """
-    try:
-        yield
-    except Exception as e:
-        error_log = ""
-        if hasattr(self, "_mnff"):
-            try:
-                nvvm = _get_nvvm_module()
-                logsize = nvvm.get_program_log_size(self._mnff.handle)
-                if logsize > 1:
-                    log = bytearray(logsize)
-                    nvvm.get_program_log(self._mnff.handle, log)
-                    error_log = log.decode("utf-8", errors="backslashreplace")
-            except Exception:
-                error_log = ""
-        # Starting Python 3.11 we could also use Exception.add_note() for the same purpose, but
-        # unfortunately we are still supporting Python 3.10...
-        e.args = (e.args[0] + (f"\nNVVM program log: {error_log}" if error_log else ""), *e.args[1:])
-        raise e
-
-
-_nvvm_module = None
-_nvvm_import_attempted = False
-
-
-def _get_nvvm_module():
-    """
-    Handles the import of NVVM module with version and availability checks.
-    NVVM bindings were added in cuda-bindings 12.9.0, so we need to handle cases where:
-    1. cuda.bindings is not new enough (< 12.9.0)
-    2. libnvvm is not found in the Python environment
-
-    Returns:
-        The nvvm module if available and working
-
-    Raises:
-        RuntimeError: If NVVM is not available due to version or library issues
-    """
-    global _nvvm_module, _nvvm_import_attempted
-
-    if _nvvm_import_attempted:
-        if _nvvm_module is None:
-            raise RuntimeError("NVVM module is not available (previous import attempt failed)")
-        return _nvvm_module
-
-    _nvvm_import_attempted = True
-
-    try:
-        version = get_binding_version()
-        if version < (12, 9):
-            raise RuntimeError(
-                f"NVVM bindings require cuda-bindings >= 12.9.0, but found {version[0]}.{version[1]}.x. "
-                "Please update cuda-bindings to use NVVM features."
-            )
-
-        from cuda.bindings import nvvm
-        from cuda.bindings._internal.nvvm import _inspect_function_pointer
-
-        if _inspect_function_pointer("__nvvmCreateProgram") == 0:
-            raise RuntimeError("NVVM library (libnvvm) is not available in this Python environment. ")
-
-        _nvvm_module = nvvm
-        return _nvvm_module
-
-    except RuntimeError as e:
-        _nvvm_module = None
-        raise e
-
-
-def _process_define_macro_inner(formatted_options, macro):
-    if isinstance(macro, str):
-        formatted_options.append(f"--define-macro={macro}")
-        return True
-    if isinstance(macro, tuple):
-        if len(macro) != 2 or any(not isinstance(val, str) for val in macro):
-            raise RuntimeError(f"Expected define_macro tuple[str, str], got {macro}")
-        formatted_options.append(f"--define-macro={macro[0]}={macro[1]}")
-        return True
-    return False
-
-def _find_libdevice_path():
-    """Find libdevice.10.bc for NVVM compilation using cuda.pathfinder."""
-    from cuda.pathfinder import get_libdevice_path
-    return get_libdevice_path()
-    
-def _process_define_macro(formatted_options, macro):
-    union_type = "Union[str, tuple[str, str]]"
-    if _process_define_macro_inner(formatted_options, macro):
-        return
-    if is_nested_sequence(macro):
-        for seq_macro in macro:
-            if not _process_define_macro_inner(formatted_options, seq_macro):
-                raise RuntimeError(f"Expected define_macro {union_type}, got {seq_macro}")
-        return
-    raise RuntimeError(f"Expected define_macro {union_type}, list[{union_type}], got {macro}")
-
-
-@dataclass
-class ProgramOptions:
-    """Customizable options for configuring `Program`.
-
-    Attributes
-    ----------
-    name : str, optional
-        Name of the program. If the compilation succeeds, the name is passed down to the generated `ObjectCode`.
-    arch : str, optional
-        Pass the SM architecture value, such as ``sm_<CC>`` (for generating CUBIN) or
-        ``compute_<CC>`` (for generating PTX). If not provided, the current device's architecture
-        will be used.
-    relocatable_device_code : bool, optional
-        Enable (disable) the generation of relocatable device code.
-        Default: False
-    extensible_whole_program : bool, optional
-        Do extensible whole program compilation of device code.
-        Default: False
-    debug : bool, optional
-        Generate debug information. If --dopt is not specified, then turns off all optimizations.
-        Default: False
-    lineinfo: bool, optional
-        Generate line-number information.
-        Default: False
-    device_code_optimize : bool, optional
-        Enable device code optimization. When specified along with ‘-G’, enables limited debug information generation
-        for optimized device code.
-        Default: None
-    ptxas_options : Union[str, list[str]], optional
-        Specify one or more options directly to ptxas, the PTX optimizing assembler. Options should be strings.
-        For example ["-v", "-O2"].
-        Default: None
-    max_register_count : int, optional
-        Specify the maximum amount of registers that GPU functions can use.
-        Default: None
-    ftz : bool, optional
-        When performing single-precision floating-point operations, flush denormal values to zero or preserve denormal
-        values.
-        Default: False
-    prec_sqrt : bool, optional
-        For single-precision floating-point square root, use IEEE round-to-nearest mode or use a faster approximation.
-        Default: True
-    prec_div : bool, optional
-        For single-precision floating-point division and reciprocals, use IEEE round-to-nearest mode or use a faster
-        approximation.
-        Default: True
-    fma : bool, optional
-        Enables (disables) the contraction of floating-point multiplies and adds/subtracts into floating-point
-        multiply-add operations.
-        Default: True
-    use_fast_math : bool, optional
-        Make use of fast math operations.
-        Default: False
-    extra_device_vectorization : bool, optional
-        Enables more aggressive device code vectorization in the NVVM optimizer.
-        Default: False
-    link_time_optimization : bool, optional
-        Generate intermediate code for later link-time optimization.
-        Default: False
-    gen_opt_lto : bool, optional
-        Run the optimizer passes before generating the LTO IR.
-        Default: False
-    define_macro : Union[str, tuple[str, str], list[Union[str, tuple[str, str]]]], optional
-        Predefine a macro. Can be either a string, in which case that macro will be set to 1, a 2 element tuple of
-        strings, in which case the first element is defined as the second, or a list of strings or tuples.
-        Default: None
-    undefine_macro : Union[str, list[str]], optional
-        Cancel any previous definition of a macro, or list of macros.
-        Default: None
-    include_path : Union[str, list[str]], optional
-        Add the directory or directories to the list of directories to be searched for headers.
-        Default: None
-    pre_include : Union[str, list[str]], optional
-        Preinclude one or more headers during preprocessing. Can be either a string or a list of strings.
-        Default: None
-    no_source_include : bool, optional
-        Disable the default behavior of adding the directory of each input source to the include path.
-        Default: False
-    std : str, optional
-        Set language dialect to C++03, C++11, C++14, C++17 or C++20.
-        Default: c++17
-    builtin_move_forward : bool, optional
-        Provide builtin definitions of std::move and std::forward.
-        Default: True
-    builtin_initializer_list : bool, optional
-        Provide builtin definitions of std::initializer_list class and member functions.
-        Default: True
-    disable_warnings : bool, optional
-        Inhibit all warning messages.
-        Default: False
-    restrict : bool, optional
-        Programmer assertion that all kernel pointer parameters are restrict pointers.
-        Default: False
-    device_as_default_execution_space : bool, optional
-        Treat entities with no execution space annotation as __device__ entities.
-        Default: False
-    device_int128 : bool, optional
-        Allow the __int128 type in device code.
-        Default: False
-    optimization_info : str, optional
-        Provide optimization reports for the specified kind of optimization.
-        Default: None
-    no_display_error_number : bool, optional
-        Disable the display of a diagnostic number for warning messages.
-        Default: False
-    diag_error : Union[int, list[int]], optional
-        Emit error for a specified diagnostic message number or comma separated list of numbers.
-        Default: None
-    diag_suppress : Union[int, list[int]], optional
-        Suppress a specified diagnostic message number or comma separated list of numbers.
-        Default: None
-    diag_warn : Union[int, list[int]], optional
-        Emit warning for a specified diagnostic message number or comma separated lis of numbers.
-        Default: None
-    brief_diagnostics : bool, optional
-        Disable or enable showing source line and column info in a diagnostic.
-        Default: False
-    time : str, optional
-        Generate a CSV table with the time taken by each compilation phase.
-        Default: None
-    split_compile : int, optional
-        Perform compiler optimizations in parallel.
-        Default: 1
-    fdevice_syntax_only : bool, optional
-        Ends device compilation after front-end syntax checking.
-        Default: False
-    minimal : bool, optional
-        Omit certain language features to reduce compile time for small programs.
-        Default: False
-    no_cache : bool, optional
-        Disable compiler caching.
-        Default: False
-    fdevice_time_trace : str, optional
-        Generate time trace JSON for profiling compilation (NVRTC only).
-        Default: None
-    device_float128 : bool, optional
-        Allow __float128 type in device code (NVRTC only).
-        Default: False
-    frandom_seed : str, optional
-        Set random seed for randomized optimizations (NVRTC only).
-        Default: None
-    ofast_compile : str, optional
-        Fast compilation mode: "0", "min", "mid", or "max" (NVRTC only).
-        Default: None
-    pch : bool, optional
-        Use default precompiled header (NVRTC only, CUDA 12.8+).
-        Default: False
-    create_pch : str, optional
-        Create precompiled header file (NVRTC only, CUDA 12.8+).
-        Default: None
-    use_pch : str, optional
-        Use specific precompiled header file (NVRTC only, CUDA 12.8+).
-        Default: None
-    pch_dir : str, optional
-        PCH directory location (NVRTC only, CUDA 12.8+).
-        Default: None
-    pch_verbose : bool, optional
-        Verbose PCH output (NVRTC only, CUDA 12.8+).
-        Default: False
-    pch_messages : bool, optional
-        Control PCH diagnostic messages (NVRTC only, CUDA 12.8+).
-        Default: False
-    instantiate_templates_in_pch : bool, optional
-        Control template instantiation in PCH (NVRTC only, CUDA 12.8+).
-        Default: False
-    """
-
-    name: str | None = "default_program"
-    arch: str | None = None
-    relocatable_device_code: bool | None = None
-    extensible_whole_program: bool | None = None
-    debug: bool | None = None
-    lineinfo: bool | None = None
-    device_code_optimize: bool | None = None
-    ptxas_options: Union[str, list[str], tuple[str]] | None = None
-    max_register_count: int | None = None
-    ftz: bool | None = None
-    prec_sqrt: bool | None = None
-    prec_div: bool | None = None
-    fma: bool | None = None
-    use_fast_math: bool | None = None
-    extra_device_vectorization: bool | None = None
-    link_time_optimization: bool | None = None
-    gen_opt_lto: bool | None = None
-    define_macro: (
-        Union[str, tuple[str, str], list[Union[str, tuple[str, str]]], tuple[Union[str, tuple[str, str]]]] | None
-    ) = None
-    undefine_macro: Union[str, list[str], tuple[str]] | None = None
-    include_path: Union[str, list[str], tuple[str]] | None = None
-    pre_include: Union[str, list[str], tuple[str]] | None = None
-    no_source_include: bool | None = None
-    std: str | None = None
-    builtin_move_forward: bool | None = None
-    builtin_initializer_list: bool | None = None
-    disable_warnings: bool | None = None
-    restrict: bool | None = None
-    device_as_default_execution_space: bool | None = None
-    device_int128: bool | None = None
-    optimization_info: str | None = None
-    no_display_error_number: bool | None = None
-    diag_error: Union[int, list[int], tuple[int]] | None = None
-    diag_suppress: Union[int, list[int], tuple[int]] | None = None
-    diag_warn: Union[int, list[int], tuple[int]] | None = None
-    brief_diagnostics: bool | None = None
-    time: str | None = None
-    split_compile: int | None = None
-    fdevice_syntax_only: bool | None = None
-    minimal: bool | None = None
-    extra_sources: (
-        Union[List[Tuple[str, Union[str, bytes, bytearray]]], Tuple[Tuple[str, Union[str, bytes, bytearray]]]] | None
-    ) = None
-    no_cache: bool | None = None
-    fdevice_time_trace: str | None = None
-    device_float128: bool | None = None
-    frandom_seed: str | None = None
-    ofast_compile: str | None = None
-    pch: bool | None = None
-    create_pch: str | None = None
-    use_pch: str | None = None
-    pch_dir: str | None = None
-    pch_verbose: bool | None = None
-    pch_messages: bool | None = None
-    instantiate_templates_in_pch: bool | None = None
-    numba_debug: bool | None = None  # Custom option for Numba debugging
-    use_libdevice: bool | None = None # For libdevice execution 
-
-    def __post_init__(self):
-        self._name = self.name.encode()
-        # Set arch to default if not provided
-        if self.arch is None:
-            self.arch = f"sm_{Device().arch}"
-
-    def _prepare_nvrtc_options(self) -> list[bytes]:
-        # Build NVRTC-specific options
-        options = [f"-arch={self.arch}"]
-        if self.relocatable_device_code is not None:
-            options.append(f"--relocatable-device-code={_handle_boolean_option(self.relocatable_device_code)}")
-        if self.extensible_whole_program is not None and self.extensible_whole_program:
-            options.append("--extensible-whole-program")
-        if self.debug is not None and self.debug:
-            options.append("--device-debug")
-        if self.lineinfo is not None and self.lineinfo:
-            options.append("--generate-line-info")
-        if self.device_code_optimize is not None and self.device_code_optimize:
-            options.append("--dopt=on")
-        if self.ptxas_options is not None:
-            opt_name = "--ptxas-options"
-            if isinstance(self.ptxas_options, str):
-                options.append(f"{opt_name}={self.ptxas_options}")
-            elif is_sequence(self.ptxas_options):
-                for opt_value in self.ptxas_options:
-                    options.append(f"{opt_name}={opt_value}")
-        if self.max_register_count is not None:
-            options.append(f"--maxrregcount={self.max_register_count}")
-        if self.ftz is not None:
-            options.append(f"--ftz={_handle_boolean_option(self.ftz)}")
-        if self.prec_sqrt is not None:
-            options.append(f"--prec-sqrt={_handle_boolean_option(self.prec_sqrt)}")
-        if self.prec_div is not None:
-            options.append(f"--prec-div={_handle_boolean_option(self.prec_div)}")
-        if self.fma is not None:
-            options.append(f"--fmad={_handle_boolean_option(self.fma)}")
-        if self.use_fast_math is not None and self.use_fast_math:
-            options.append("--use_fast_math")
-        if self.extra_device_vectorization is not None and self.extra_device_vectorization:
-            options.append("--extra-device-vectorization")
-        if self.link_time_optimization is not None and self.link_time_optimization:
-            options.append("--dlink-time-opt")
-        if self.gen_opt_lto is not None and self.gen_opt_lto:
-            options.append("--gen-opt-lto")
-        if self.define_macro is not None:
-            _process_define_macro(options, self.define_macro)
-        if self.undefine_macro is not None:
-            if isinstance(self.undefine_macro, str):
-                options.append(f"--undefine-macro={self.undefine_macro}")
-            elif is_sequence(self.undefine_macro):
-                for macro in self.undefine_macro:
-                    options.append(f"--undefine-macro={macro}")
-        if self.include_path is not None:
-            if isinstance(self.include_path, str):
-                options.append(f"--include-path={self.include_path}")
-            elif is_sequence(self.include_path):
-                for path in self.include_path:
-                    options.append(f"--include-path={path}")
-        if self.pre_include is not None:
-            if isinstance(self.pre_include, str):
-                options.append(f"--pre-include={self.pre_include}")
-            elif is_sequence(self.pre_include):
-                for header in self.pre_include:
-                    options.append(f"--pre-include={header}")
-        if self.no_source_include is not None and self.no_source_include:
-            options.append("--no-source-include")
-        if self.std is not None:
-            options.append(f"--std={self.std}")
-        if self.builtin_move_forward is not None:
-            options.append(f"--builtin-move-forward={_handle_boolean_option(self.builtin_move_forward)}")
-        if self.builtin_initializer_list is not None:
-            options.append(f"--builtin-initializer-list={_handle_boolean_option(self.builtin_initializer_list)}")
-        if self.disable_warnings is not None and self.disable_warnings:
-            options.append("--disable-warnings")
-        if self.restrict is not None and self.restrict:
-            options.append("--restrict")
-        if self.device_as_default_execution_space is not None and self.device_as_default_execution_space:
-            options.append("--device-as-default-execution-space")
-        if self.device_int128 is not None and self.device_int128:
-            options.append("--device-int128")
-        if self.device_float128 is not None and self.device_float128:
-            options.append("--device-float128")
-        if self.optimization_info is not None:
-            options.append(f"--optimization-info={self.optimization_info}")
-        if self.no_display_error_number is not None and self.no_display_error_number:
-            options.append("--no-display-error-number")
-        if self.diag_error is not None:
-            if isinstance(self.diag_error, int):
-                options.append(f"--diag-error={self.diag_error}")
-            elif is_sequence(self.diag_error):
-                for error in self.diag_error:
-                    options.append(f"--diag-error={error}")
-        if self.diag_suppress is not None:
-            if isinstance(self.diag_suppress, int):
-                options.append(f"--diag-suppress={self.diag_suppress}")
-            elif is_sequence(self.diag_suppress):
-                for suppress in self.diag_suppress:
-                    options.append(f"--diag-suppress={suppress}")
-        if self.diag_warn is not None:
-            if isinstance(self.diag_warn, int):
-                options.append(f"--diag-warn={self.diag_warn}")
-            elif is_sequence(self.diag_warn):
-                for warn in self.diag_warn:
-                    options.append(f"--diag-warn={warn}")
-        if self.brief_diagnostics is not None:
-            options.append(f"--brief-diagnostics={_handle_boolean_option(self.brief_diagnostics)}")
-        if self.time is not None:
-            options.append(f"--time={self.time}")
-        if self.split_compile is not None:
-            options.append(f"--split-compile={self.split_compile}")
-        if self.fdevice_syntax_only is not None and self.fdevice_syntax_only:
-            options.append("--fdevice-syntax-only")
-        if self.minimal is not None and self.minimal:
-            options.append("--minimal")
-        if self.no_cache is not None and self.no_cache:
-            options.append("--no-cache")
-        if self.fdevice_time_trace is not None:
-            options.append(f"--fdevice-time-trace={self.fdevice_time_trace}")
-        if self.frandom_seed is not None:
-            options.append(f"--frandom-seed={self.frandom_seed}")
-        if self.ofast_compile is not None:
-            options.append(f"--Ofast-compile={self.ofast_compile}")
-        # PCH options (CUDA 12.8+)
-        if self.pch is not None and self.pch:
-            options.append("--pch")
-        if self.create_pch is not None:
-            options.append(f"--create-pch={self.create_pch}")
-        if self.use_pch is not None:
-            options.append(f"--use-pch={self.use_pch}")
-        if self.pch_dir is not None:
-            options.append(f"--pch-dir={self.pch_dir}")
-        if self.pch_verbose is not None:
-            options.append(f"--pch-verbose={_handle_boolean_option(self.pch_verbose)}")
-        if self.pch_messages is not None:
-            options.append(f"--pch-messages={_handle_boolean_option(self.pch_messages)}")
-        if self.instantiate_templates_in_pch is not None:
-            options.append(
-                f"--instantiate-templates-in-pch={_handle_boolean_option(self.instantiate_templates_in_pch)}"
-            )
-        if self.numba_debug:
-            options.append("--numba-debug")
-        return [o.encode() for o in options]
-
-    def _prepare_nvvm_options(self, as_bytes: bool = True) -> Union[list[bytes], list[str]]:
-
-        options = []
-
-        # Options supported by NVVM
-        assert self.arch is not None
-        arch = self.arch
-        if arch.startswith("sm_"):
-            arch = f"compute_{arch[3:]}"
-        options.append(f"-arch={arch}")
-        if self.debug is not None and self.debug:
-            options.append("-g")
-        if self.device_code_optimize is False:
-            options.append("-opt=0")
-        elif self.device_code_optimize is True:
-            options.append("-opt=3")
-        # NVVM uses 0/1 instead of true/false for boolean options
-        if self.ftz is not None:
-            options.append(f"-ftz={'1' if self.ftz else '0'}")
-        if self.prec_sqrt is not None:
-            options.append(f"-prec-sqrt={'1' if self.prec_sqrt else '0'}")
-        if self.prec_div is not None:
-            options.append(f"-prec-div={'1' if self.prec_div else '0'}")
-        if self.fma is not None:
-            options.append(f"-fma={'1' if self.fma else '0'}")
-
-        # Check for unsupported options and raise error if they are set
-        unsupported = []
-        if self.relocatable_device_code is not None:
-            unsupported.append("relocatable_device_code")
-        if self.extensible_whole_program is not None and self.extensible_whole_program:
-            unsupported.append("extensible_whole_program")
-        if self.lineinfo is not None and self.lineinfo:
-            unsupported.append("lineinfo")
-        if self.ptxas_options is not None:
-            unsupported.append("ptxas_options")
-        if self.max_register_count is not None:
-            unsupported.append("max_register_count")
-        if self.use_fast_math is not None and self.use_fast_math:
-            unsupported.append("use_fast_math")
-        if self.extra_device_vectorization is not None and self.extra_device_vectorization:
-            unsupported.append("extra_device_vectorization")
-        if self.gen_opt_lto is not None and self.gen_opt_lto:
-            unsupported.append("gen_opt_lto")
-        if self.define_macro is not None:
-            unsupported.append("define_macro")
-        if self.undefine_macro is not None:
-            unsupported.append("undefine_macro")
-        if self.include_path is not None:
-            unsupported.append("include_path")
-        if self.pre_include is not None:
-            unsupported.append("pre_include")
-        if self.no_source_include is not None and self.no_source_include:
-            unsupported.append("no_source_include")
-        if self.std is not None:
-            unsupported.append("std")
-        if self.builtin_move_forward is not None:
-            unsupported.append("builtin_move_forward")
-        if self.builtin_initializer_list is not None:
-            unsupported.append("builtin_initializer_list")
-        if self.disable_warnings is not None and self.disable_warnings:
-            unsupported.append("disable_warnings")
-        if self.restrict is not None and self.restrict:
-            unsupported.append("restrict")
-        if self.device_as_default_execution_space is not None and self.device_as_default_execution_space:
-            unsupported.append("device_as_default_execution_space")
-        if self.device_int128 is not None and self.device_int128:
-            unsupported.append("device_int128")
-        if self.optimization_info is not None:
-            unsupported.append("optimization_info")
-        if self.no_display_error_number is not None and self.no_display_error_number:
-            unsupported.append("no_display_error_number")
-        if self.diag_error is not None:
-            unsupported.append("diag_error")
-        if self.diag_suppress is not None:
-            unsupported.append("diag_suppress")
-        if self.diag_warn is not None:
-            unsupported.append("diag_warn")
-        if self.brief_diagnostics is not None:
-            unsupported.append("brief_diagnostics")
-        if self.time is not None:
-            unsupported.append("time")
-        if self.split_compile is not None:
-            unsupported.append("split_compile")
-        if self.fdevice_syntax_only is not None and self.fdevice_syntax_only:
-            unsupported.append("fdevice_syntax_only")
-        if self.minimal is not None and self.minimal:
-            unsupported.append("minimal")
-        if self.numba_debug is not None and self.numba_debug:
-            unsupported.append("numba_debug")
-        if unsupported:
-            raise CUDAError(f"The following options are not supported by NVVM backend: {', '.join(unsupported)}")
-
-        if as_bytes:
-            return [o.encode() for o in options]
-        else:
-            return options
-
-    def as_bytes(self, backend: str) -> list[bytes]:
-        """Convert program options to bytes format for the specified backend.
-
-        This method transforms the program options into a format suitable for the
-        specified compiler backend. Different backends may use different option names
-        and formats even for the same conceptual options.
-
-        Parameters
-        ----------
-        backend : str
-            The compiler backend to prepare options for. Must be either "nvrtc" or "nvvm".
-
-        Returns
-        -------
-        list[bytes]
-            List of option strings encoded as bytes.
-
-        Raises
-        ------
-        ValueError
-            If an unknown backend is specified.
-        CUDAError
-            If an option incompatible with the specified backend is set.
-
-        Examples
-        --------
-        >>> options = ProgramOptions(arch="sm_80", debug=True)
-        >>> nvrtc_options = options.as_bytes("nvrtc")
-        """
-        backend = backend.lower()
-        if backend == "nvrtc":
-            return self._prepare_nvrtc_options()
-        elif backend == "nvvm":
-            return self._prepare_nvvm_options(as_bytes=True)
-        else:
-            raise ValueError(f"Unknown backend '{backend}'. Must be one of: 'nvrtc', 'nvvm'")
-
-    def __repr__(self):
-        return f"ProgramOptions(name={self.name!r}, arch={self.arch!r})"
-
-
-ProgramHandleT = Union["cuda.bindings.nvrtc.nvrtcProgram", LinkerHandleT]
-
-
-class Program:
-    """Represent a compilation machinery to process programs into
-    :obj:`~_module.ObjectCode`.
-
-    This object provides a unified interface to multiple underlying
-    compiler libraries. Compilation support is enabled for a wide
-    range of code types and compilation types.
-
-    Parameters
-    ----------
-    code : Any
-        String of the CUDA Runtime Compilation program.
-    code_type : Any
-        String of the code type. Currently ``"ptx"``, ``"c++"``, and ``"nvvm"`` are supported.
-    options : ProgramOptions, optional
-        A ProgramOptions object to customize the compilation process.
-        See :obj:`ProgramOptions` for more information.
-    """
-
-    class _MembersNeededForFinalize:
-        __slots__ = "handle", "backend"
-
-        def __init__(self, program_obj, handle, backend):
-            self.handle = handle
-            self.backend = backend
-            weakref.finalize(program_obj, self.close)
-
-        def close(self):
-            if self.handle is not None:
-                if self.backend == "NVRTC":
-                    handle_return(nvrtc.nvrtcDestroyProgram(self.handle))
-                elif self.backend == "NVVM":
-                    nvvm = _get_nvvm_module()
-                    nvvm.destroy_program(self.handle)
-                self.handle = None
-
-    __slots__ = ("__weakref__", "_mnff", "_backend", "_linker", "_options", "_module_count")
-
-    def __init__(self, code, code_type, options: ProgramOptions = None):
-        self._mnff = Program._MembersNeededForFinalize(self, None, None)
-
-        self._options = options = check_or_create_options(ProgramOptions, options, "Program options")
-        code_type = code_type.lower()
-        self._module_count = 0
-
-        if code_type == "c++":
-            assert_type(code, str)
-            # TODO: support pre-loaded headers & include names
-            # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved
-            if options.extra_sources is not None:
-                raise ValueError("extra_sources is not supported by the NVRTC backend (C++ code_type)")
-
-
-            self._mnff.handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), options._name, 0, [], []))
-            self._mnff.backend = "NVRTC"
-            self._backend = "NVRTC"
-            self._linker = None
-
-        elif code_type == "ptx":
-            assert_type(code, str)
-            if options.extra_sources is not None:
-                raise ValueError("extra_sources is not supported by the PTX backend.")
-            self._linker = Linker(
-                ObjectCode._init(code.encode(), code_type), options=self._translate_program_options(options)
-            )
-            self._backend = self._linker.backend
-
-        elif code_type == "nvvm":
-            if isinstance(code, str):
-                code = code.encode("utf-8")
-            elif not isinstance(code, (bytes, bytearray)):
-                raise TypeError("NVVM IR code must be provided as str, bytes, or bytearray")
-
-            nvvm = _get_nvvm_module()
-            self._mnff.handle = nvvm.create_program()
-            self._mnff.backend = "NVVM"
-            nvvm.add_module_to_program(self._mnff.handle, code, len(code), options._name.decode())
-            self._module_count = 1
-            # Add extra modules if provided
-            if options.extra_sources is not None:
-                if not is_sequence(options.extra_sources):
-                    raise TypeError(
-                        "extra_modules must be a sequence of 2-tuples:((name1, source1), (name2, source2), ...)"
-                    )
-                for i, module in enumerate(options.extra_sources):
-                    if not isinstance(module, tuple) or len(module) != 2:
-                        raise TypeError(
-                            f"Each extra module must be a 2-tuple (name, source)"
-                            f", got {type(module).__name__} at index {i}"
-                        )
-
-                    module_name, module_source = module
-
-                    if not isinstance(module_name, str):
-                        raise TypeError(f"Module name at index {i} must be a string,got {type(module_name).__name__}")
-
-                    if isinstance(module_source, str):
-                        # Textual LLVM IR - encode to UTF-8 bytes
-                        module_source = module_source.encode("utf-8")
-                    elif not isinstance(module_source, (bytes, bytearray)):
-                        raise TypeError(
-                            f"Module source at index {i} must be str (textual LLVM IR), bytes (textual LLVM IR or bitcode), "
-                            f"or bytearray, got {type(module_source).__name__}"
-                        )
-
-                    if len(module_source) == 0:
-                        raise ValueError(f"Module source for '{module_name}' (index {i}) cannot be empty")
-
-                    nvvm.add_module_to_program(self._mnff.handle, module_source, len(module_source), module_name)
-                    self._module_count += 1
-                    
-            self._use_libdevice = options.use_libdevice
-            self._backend = "NVVM"
-            self._linker = None
-
-        else:
-            supported_code_types = ("c++", "ptx", "nvvm")
-            assert code_type not in supported_code_types, f"{code_type=}"
-            raise RuntimeError(f"Unsupported {code_type=} ({supported_code_types=})")
-
-    def _translate_program_options(self, options: ProgramOptions) -> LinkerOptions:
-        return LinkerOptions(
-            name=options.name,
-            arch=options.arch,
-            max_register_count=options.max_register_count,
-            time=options.time,
-            link_time_optimization=options.link_time_optimization,
-            debug=options.debug,
-            lineinfo=options.lineinfo,
-            ftz=options.ftz,
-            prec_div=options.prec_div,
-            prec_sqrt=options.prec_sqrt,
-            fma=options.fma,
-            split_compile=options.split_compile,
-            ptxas_options=options.ptxas_options,
-            no_cache=options.no_cache,
-        )
-
-    def close(self):
-        """Destroy this program."""
-        if self._linker:
-            self._linker.close()
-        self._mnff.close()
-
-    @staticmethod
-    def _can_load_generated_ptx():
-        driver_ver = handle_return(driver.cuDriverGetVersion())
-        nvrtc_major, nvrtc_minor = handle_return(nvrtc.nvrtcVersion())
-        return nvrtc_major * 1000 + nvrtc_minor * 10 <= driver_ver
-
-    def compile(self, target_type, name_expressions=(), logs=None):
-        """Compile the program with a specific compilation type.
-
-        Parameters
-        ----------
-        target_type : Any
-            String of the targeted compilation type.
-            Supported options are "ptx", "cubin" and "ltoir".
-        name_expressions : Union[list, tuple], optional
-            List of explicit name expressions to become accessible.
-            (Default to no expressions)
-        logs : Any, optional
-            Object with a write method to receive the logs generated
-            from compilation.
-            (Default to no logs)
-
-        Returns
-        -------
-        :obj:`~_module.ObjectCode`
-            Newly created code object.
-
-        """
-        supported_target_types = ("ptx", "cubin", "ltoir")
-        if target_type not in supported_target_types:
-            raise ValueError(f'Unsupported target_type="{target_type}" ({supported_target_types=})')
-
-        if self._backend == "NVRTC":
-            if target_type == "ptx" and not self._can_load_generated_ptx():
-                warn(
-                    "The CUDA driver version is older than the backend version. "
-                    "The generated ptx will not be loadable by the current driver.",
-                    stacklevel=1,
-                    category=RuntimeWarning,
-                )
-            if name_expressions:
-                for n in name_expressions:
-                    handle_return(
-                        nvrtc.nvrtcAddNameExpression(self._mnff.handle, n.encode()),
-                        handle=self._mnff.handle,
-                    )
-            options = self._options.as_bytes("nvrtc")
-            handle_return(
-                nvrtc.nvrtcCompileProgram(self._mnff.handle, len(options), options),
-                handle=self._mnff.handle,
-            )
-
-            size_func = getattr(nvrtc, f"nvrtcGet{target_type.upper()}Size")
-            comp_func = getattr(nvrtc, f"nvrtcGet{target_type.upper()}")
-            size = handle_return(size_func(self._mnff.handle), handle=self._mnff.handle)
-            data = b" " * size
-            handle_return(comp_func(self._mnff.handle, data), handle=self._mnff.handle)
-
-            symbol_mapping = {}
-            if name_expressions:
-                for n in name_expressions:
-                    symbol_mapping[n] = handle_return(
-                        nvrtc.nvrtcGetLoweredName(self._mnff.handle, n.encode()), handle=self._mnff.handle
-                    )
-
-            if logs is not None:
-                logsize = handle_return(nvrtc.nvrtcGetProgramLogSize(self._mnff.handle), handle=self._mnff.handle)
-                if logsize > 1:
-                    log = b" " * logsize
-                    handle_return(nvrtc.nvrtcGetProgramLog(self._mnff.handle, log), handle=self._mnff.handle)
-                    logs.write(log.decode("utf-8", errors="backslashreplace"))
-
-            return ObjectCode._init(data, target_type, symbol_mapping=symbol_mapping, name=self._options.name)
-
-        elif self._backend == "NVVM":
-            if target_type not in ("ptx", "ltoir"):
-                raise ValueError(f'NVVM backend only supports target_type="ptx", "ltoir", got "{target_type}"')
-
-            # TODO: flip to True when NVIDIA/cuda-python#1354 is resolved and CUDA 12 is dropped
-            nvvm_options = self._options._prepare_nvvm_options(as_bytes=False)
-            if target_type == "ltoir" and "-gen-lto" not in nvvm_options:
-                nvvm_options.append("-gen-lto")
-            nvvm = _get_nvvm_module()
-            with _nvvm_exception_manager(self):
-                nvvm.verify_program(self._mnff.handle, len(nvvm_options), nvvm_options)
-                # libdevice compilation 
-                if getattr(self, '_use_libdevice', False):
-                    libdevice_path = _find_libdevice_path()
-                    if libdevice_path is None:
-                        raise RuntimeError(
-                            "use_libdevice=True but could not find libdevice.10.bc. "
-                            "Ensure CUDA toolkit is installed."
-                        )
-                    with open(libdevice_path, "rb") as f:
-                        libdevice_bc = f.read()
-                    # libdevice for numba-cuda
-                    nvvm.lazy_add_module_to_program(self._mnff.handle, libdevice_bc, len(libdevice_bc), None)
-                
-                nvvm.compile_program(self._mnff.handle, len(nvvm_options), nvvm_options)
-
-            size = nvvm.get_compiled_result_size(self._mnff.handle)
-            data = bytearray(size)
-            nvvm.get_compiled_result(self._mnff.handle, data)
-
-            if logs is not None:
-                logsize = nvvm.get_program_log_size(self._mnff.handle)
-                if logsize > 1:
-                    log = bytearray(logsize)
-                    nvvm.get_program_log(self._mnff.handle, log)
-                    logs.write(log.decode("utf-8", errors="backslashreplace"))
-
-            return ObjectCode._init(data, target_type, name=self._options.name)
-
-        supported_backends = ("nvJitLink", "driver")
-        if self._backend not in supported_backends:
-            raise ValueError(f'Unsupported backend="{self._backend}" ({supported_backends=})')
-        return self._linker.link(target_type)
-
-    @property
-    def backend(self) -> str:
-        """Return this Program instance's underlying backend."""
-        return self._backend
-
-    @property
-    def handle(self) -> ProgramHandleT:
-        """Return the underlying handle object.
-
-        .. note::
-
-           The type of the returned object depends on the backend.
-
-        .. caution::
-
-            This handle is a Python object. To get the memory address of the underlying C
-            handle, call ``int(Program.handle)``.
-        """
-        return self._mnff.handle
diff --git a/cuda_core/cuda/core/_program.pyx b/cuda_core/cuda/core/_program.pyx
new file mode 100644
index 0000000000..7f8f83d06b
--- /dev/null
+++ b/cuda_core/cuda/core/_program.pyx
@@ -0,0 +1,1038 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+"""Compilation machinery for CUDA programs.
+
+This module provides :class:`Program` for compiling source code into
+:class:`~cuda.core.ObjectCode`, with :class:`ProgramOptions` for configuration.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from warnings import warn
+
+from cuda.bindings import driver, nvrtc
+
+from libcpp.vector cimport vector
+
+from ._resource_handles cimport (
+    as_cu,
+    as_py,
+    create_nvrtc_program_handle,
+    create_nvvm_program_handle,
+)
+from cuda.bindings cimport cynvrtc, cynvvm
+from cuda.core._utils.cuda_utils cimport HANDLE_RETURN_NVRTC, HANDLE_RETURN_NVVM
+from cuda.core._device import Device
+from cuda.core._linker import Linker, LinkerHandleT, LinkerOptions
+from cuda.core._module import ObjectCode
+from cuda.core._utils.clear_error_support import assert_type
+from cuda.core._utils.cuda_utils import (
+    CUDAError,
+    _handle_boolean_option,
+    check_or_create_options,
+    get_binding_version,
+    handle_return,
+    is_nested_sequence,
+    is_sequence,
+)
+
+__all__ = ["Program", "ProgramOptions"]
+
+ProgramHandleT = nvrtc.nvrtcProgram | int | LinkerHandleT
+"""Type alias for program handle types across different backends.
+
+The ``int`` type covers NVVM handles, which don't have a wrapper class.
+"""
+
+
+# =============================================================================
+# Principal Class
+# =============================================================================
+
+
+cdef class Program:
+    """Represent a compilation machinery to process programs into
+    :class:`~cuda.core.ObjectCode`.
+
+    This object provides a unified interface to multiple underlying
+    compiler libraries. Compilation support is enabled for a wide
+    range of code types and compilation types.
+
+    Parameters
+    ----------
+    code : str | bytes | bytearray
+        The source code to compile. For C++ and PTX, must be a string.
+        For NVVM IR, can be str, bytes, or bytearray.
+    code_type : str
+        The type of source code. Must be one of ``"c++"``, ``"ptx"``, or ``"nvvm"``.
+    options : :class:`ProgramOptions`, optional
+        Options to customize the compilation process.
+    """
+
+    def __init__(self, code: str | bytes | bytearray, code_type: str, options: ProgramOptions | None = None):
+        Program_init(self, code, code_type, options)
+
+    def close(self):
+        """Destroy this program."""
+        if self._linker:
+            self._linker.close()
+        # Reset handles - the C++ shared_ptr destructor handles cleanup
+        self._h_nvrtc.reset()
+        self._h_nvvm.reset()
+
+    def compile(
+        self, target_type: str, name_expressions: tuple | list = (), logs = None
+    ) -> ObjectCode:
+        """Compile the program to the specified target type.
+
+        Parameters
+        ----------
+        target_type : str
+            The compilation target. Must be one of ``"ptx"``, ``"cubin"``, or ``"ltoir"``.
+        name_expressions : tuple | list, optional
+            Sequence of name expressions to make accessible in the compiled code.
+            Used for template instantiation and similar cases.
+        logs : object, optional
+            Object with a ``write`` method to receive compilation logs.
+
+        Returns
+        -------
+        :class:`~cuda.core.ObjectCode`
+            The compiled object code.
+        """
+        return Program_compile(self, target_type, name_expressions, logs)
+
+    @property
+    def backend(self) -> str:
+        """Return this Program instance's underlying backend."""
+        return self._backend
+
+    @property
+    def handle(self) -> ProgramHandleT:
+        """Return the underlying handle object.
+
+        .. note::
+
+           The type of the returned object depends on the backend.
+
+        .. caution::
+
+            This handle is a Python object. To get the memory address of the underlying C
+            handle, call ``int(Program.handle)``.
+        """
+        if self._backend == "NVRTC":
+            return as_py(self._h_nvrtc)
+        elif self._backend == "NVVM":
+            return as_py(self._h_nvvm)  # returns int (NVVM uses raw integers)
+        else:
+            return self._linker.handle
+
+    def __repr__(self) -> str:
+        return f"<Program backend='{self._backend}'>"
+
+
+# =============================================================================
+# Other Public Classes
+# =============================================================================
+
+
+@dataclass
+class ProgramOptions:
+    """Customizable options for configuring :class:`Program`.
+
+    Attributes
+    ----------
+    name : str, optional
+        Name of the program. If the compilation succeeds, the name is passed down to the generated `ObjectCode`.
+    arch : str, optional
+        Pass the SM architecture value, such as ``sm_<CC>`` (for generating CUBIN) or
+        ``compute_<CC>`` (for generating PTX). If not provided, the current device's architecture
+        will be used.
+    relocatable_device_code : bool, optional
+        Enable (disable) the generation of relocatable device code.
+        Default: False
+    extensible_whole_program : bool, optional
+        Do extensible whole program compilation of device code.
+        Default: False
+    debug : bool, optional
+        Generate debug information. If --dopt is not specified, then turns off all optimizations.
+        Default: False
+    lineinfo: bool, optional
+        Generate line-number information.
+        Default: False
+    device_code_optimize : bool, optional
+        Enable device code optimization. When specified along with '-G', enables limited debug information generation
+        for optimized device code.
+        Default: None
+    ptxas_options : Union[str, list[str]], optional
+        Specify one or more options directly to ptxas, the PTX optimizing assembler. Options should be strings.
+        For example ["-v", "-O2"].
+        Default: None
+    max_register_count : int, optional
+        Specify the maximum amount of registers that GPU functions can use.
+        Default: None
+    ftz : bool, optional
+        When performing single-precision floating-point operations, flush denormal values to zero or preserve denormal
+        values.
+        Default: False
+    prec_sqrt : bool, optional
+        For single-precision floating-point square root, use IEEE round-to-nearest mode or use a faster approximation.
+        Default: True
+    prec_div : bool, optional
+        For single-precision floating-point division and reciprocals, use IEEE round-to-nearest mode or use a faster
+        approximation.
+        Default: True
+    fma : bool, optional
+        Enables (disables) the contraction of floating-point multiplies and adds/subtracts into floating-point
+        multiply-add operations.
+        Default: True
+    use_fast_math : bool, optional
+        Make use of fast math operations.
+        Default: False
+    extra_device_vectorization : bool, optional
+        Enables more aggressive device code vectorization in the NVVM optimizer.
+        Default: False
+    link_time_optimization : bool, optional
+        Generate intermediate code for later link-time optimization.
+        Default: False
+    gen_opt_lto : bool, optional
+        Run the optimizer passes before generating the LTO IR.
+        Default: False
+    define_macro : Union[str, tuple[str, str], list[Union[str, tuple[str, str]]]], optional
+        Predefine a macro. Can be either a string, in which case that macro will be set to 1, a 2 element tuple of
+        strings, in which case the first element is defined as the second, or a list of strings or tuples.
+        Default: None
+    undefine_macro : Union[str, list[str]], optional
+        Cancel any previous definition of a macro, or list of macros.
+        Default: None
+    include_path : Union[str, list[str]], optional
+        Add the directory or directories to the list of directories to be searched for headers.
+        Default: None
+    pre_include : Union[str, list[str]], optional
+        Preinclude one or more headers during preprocessing. Can be either a string or a list of strings.
+        Default: None
+    no_source_include : bool, optional
+        Disable the default behavior of adding the directory of each input source to the include path.
+        Default: False
+    std : str, optional
+        Set language dialect to C++03, C++11, C++14, C++17 or C++20.
+        Default: c++17
+    builtin_move_forward : bool, optional
+        Provide builtin definitions of std::move and std::forward.
+        Default: True
+    builtin_initializer_list : bool, optional
+        Provide builtin definitions of std::initializer_list class and member functions.
+        Default: True
+    disable_warnings : bool, optional
+        Inhibit all warning messages.
+        Default: False
+    restrict : bool, optional
+        Programmer assertion that all kernel pointer parameters are restrict pointers.
+        Default: False
+    device_as_default_execution_space : bool, optional
+        Treat entities with no execution space annotation as __device__ entities.
+        Default: False
+    device_int128 : bool, optional
+        Allow the __int128 type in device code.
+        Default: False
+    optimization_info : str, optional
+        Provide optimization reports for the specified kind of optimization.
+        Default: None
+    no_display_error_number : bool, optional
+        Disable the display of a diagnostic number for warning messages.
+        Default: False
+    diag_error : Union[int, list[int]], optional
+        Emit error for a specified diagnostic message number or comma separated list of numbers.
+        Default: None
+    diag_suppress : Union[int, list[int]], optional
+        Suppress a specified diagnostic message number or comma separated list of numbers.
+        Default: None
+    diag_warn : Union[int, list[int]], optional
+        Emit warning for a specified diagnostic message number or comma separated lis of numbers.
+        Default: None
+    brief_diagnostics : bool, optional
+        Disable or enable showing source line and column info in a diagnostic.
+        Default: False
+    time : str, optional
+        Generate a CSV table with the time taken by each compilation phase.
+        Default: None
+    split_compile : int, optional
+        Perform compiler optimizations in parallel.
+        Default: 1
+    fdevice_syntax_only : bool, optional
+        Ends device compilation after front-end syntax checking.
+        Default: False
+    minimal : bool, optional
+        Omit certain language features to reduce compile time for small programs.
+        Default: False
+    no_cache : bool, optional
+        Disable compiler caching.
+        Default: False
+    fdevice_time_trace : str, optional
+        Generate time trace JSON for profiling compilation (NVRTC only).
+        Default: None
+    device_float128 : bool, optional
+        Allow __float128 type in device code (NVRTC only).
+        Default: False
+    frandom_seed : str, optional
+        Set random seed for randomized optimizations (NVRTC only).
+        Default: None
+    ofast_compile : str, optional
+        Fast compilation mode: "0", "min", "mid", or "max" (NVRTC only).
+        Default: None
+    pch : bool, optional
+        Use default precompiled header (NVRTC only, CUDA 12.8+).
+        Default: False
+    create_pch : str, optional
+        Create precompiled header file (NVRTC only, CUDA 12.8+).
+        Default: None
+    use_pch : str, optional
+        Use specific precompiled header file (NVRTC only, CUDA 12.8+).
+        Default: None
+    pch_dir : str, optional
+        PCH directory location (NVRTC only, CUDA 12.8+).
+        Default: None
+    pch_verbose : bool, optional
+        Verbose PCH output (NVRTC only, CUDA 12.8+).
+        Default: False
+    pch_messages : bool, optional
+        Control PCH diagnostic messages (NVRTC only, CUDA 12.8+).
+        Default: False
+    instantiate_templates_in_pch : bool, optional
+        Control template instantiation in PCH (NVRTC only, CUDA 12.8+).
+        Default: False
+    """
+
+    name: str | None = "default_program"
+    arch: str | None = None
+    relocatable_device_code: bool | None = None
+    extensible_whole_program: bool | None = None
+    debug: bool | None = None
+    lineinfo: bool | None = None
+    device_code_optimize: bool | None = None
+    ptxas_options: str | list[str] | tuple[str] | None = None
+    max_register_count: int | None = None
+    ftz: bool | None = None
+    prec_sqrt: bool | None = None
+    prec_div: bool | None = None
+    fma: bool | None = None
+    use_fast_math: bool | None = None
+    extra_device_vectorization: bool | None = None
+    link_time_optimization: bool | None = None
+    gen_opt_lto: bool | None = None
+    define_macro: str | tuple[str, str] | list[str | tuple[str, str]] | tuple[str | tuple[str, str], ...] | None = None
+    undefine_macro: str | list[str] | tuple[str] | None = None
+    include_path: str | list[str] | tuple[str] | None = None
+    pre_include: str | list[str] | tuple[str] | None = None
+    no_source_include: bool | None = None
+    std: str | None = None
+    builtin_move_forward: bool | None = None
+    builtin_initializer_list: bool | None = None
+    disable_warnings: bool | None = None
+    restrict: bool | None = None
+    device_as_default_execution_space: bool | None = None
+    device_int128: bool | None = None
+    optimization_info: str | None = None
+    no_display_error_number: bool | None = None
+    diag_error: int | list[int] | tuple[int] | None = None
+    diag_suppress: int | list[int] | tuple[int] | None = None
+    diag_warn: int | list[int] | tuple[int] | None = None
+    brief_diagnostics: bool | None = None
+    time: str | None = None
+    split_compile: int | None = None
+    fdevice_syntax_only: bool | None = None
+    minimal: bool | None = None
+    no_cache: bool | None = None
+    fdevice_time_trace: str | None = None
+    device_float128: bool | None = None
+    frandom_seed: str | None = None
+    ofast_compile: str | None = None
+    pch: bool | None = None
+    create_pch: str | None = None
+    use_pch: str | None = None
+    pch_dir: str | None = None
+    pch_verbose: bool | None = None
+    pch_messages: bool | None = None
+    instantiate_templates_in_pch: bool | None = None
+    extra_sources: list[tuple[str, str | bytes | bytearray]] | tuple[tuple[str, str | bytes | bytearray], ...] | None = None
+    use_libdevice: bool | None = None  # For libdevice execution
+    numba_debug: bool | None = None  # Custom option for Numba debugging
+
+    def __post_init__(self):
+        self._name = self.name.encode()
+        # Set arch to default if not provided
+        if self.arch is None:
+            self.arch = f"sm_{Device().arch}"
+
+    def _prepare_nvrtc_options(self) -> list[bytes]:
+        return _prepare_nvrtc_options_impl(self)
+
+    def _prepare_nvvm_options(self, as_bytes: bool = True) -> list[bytes] | list[str]:
+        return _prepare_nvvm_options_impl(self, as_bytes)
+
+    def as_bytes(self, backend: str, target_type: str | None = None) -> list[bytes]:
+        """Convert program options to bytes format for the specified backend.
+
+        This method transforms the program options into a format suitable for the
+        specified compiler backend. Different backends may use different option names
+        and formats even for the same conceptual options.
+
+        Parameters
+        ----------
+        backend : str
+            The compiler backend to prepare options for. Must be either "nvrtc" or "nvvm".
+        target_type : str, optional
+            The compilation target type (e.g., "ptx", "cubin", "ltoir"). Some backends
+            require additional options based on the target type.
+
+        Returns
+        -------
+        list[bytes]
+            List of option strings encoded as bytes.
+
+        Raises
+        ------
+        ValueError
+            If an unknown backend is specified.
+        CUDAError
+            If an option incompatible with the specified backend is set.
+
+        Examples
+        --------
+        >>> options = ProgramOptions(arch="sm_80", debug=True)
+        >>> nvrtc_options = options.as_bytes("nvrtc")
+        """
+        backend = backend.lower()
+        if backend == "nvrtc":
+            return self._prepare_nvrtc_options()
+        elif backend == "nvvm":
+            options = self._prepare_nvvm_options(as_bytes=True)
+            if target_type == "ltoir" and b"-gen-lto" not in options:
+                options.append(b"-gen-lto")
+            return options
+        else:
+            raise ValueError(f"Unknown backend '{backend}'. Must be one of: 'nvrtc', 'nvvm'")
+
+    def __repr__(self):
+        return f"ProgramOptions(name={self.name!r}, arch={self.arch!r})"
+
+
+# =============================================================================
+# Private Classes and Helper Functions
+# =============================================================================
+
+# Module-level state for NVVM lazy loading
+cdef object_nvvm_module = None
+cdef bint _nvvm_import_attempted = False
+
+
+def _get_nvvm_module():
+    """Get the NVVM module, importing it lazily with availability checks."""
+    global _nvvm_module, _nvvm_import_attempted
+
+    if _nvvm_import_attempted:
+        if _nvvm_module is None:
+            raise RuntimeError("NVVM module is not available (previous import attempt failed)")
+        return _nvvm_module
+
+    _nvvm_import_attempted = True
+
+    try:
+        version = get_binding_version()
+        if version < (12, 9):
+            raise RuntimeError(
+                f"NVVM bindings require cuda-bindings >= 12.9.0, but found {version[0]}.{version[1]}.x. "
+                "Please update cuda-bindings to use NVVM features."
+            )
+
+        from cuda.bindings import nvvm
+        from cuda.bindings._internal.nvvm import _inspect_function_pointer
+
+        if _inspect_function_pointer("__nvvmCreateProgram") == 0:
+            raise RuntimeError("NVVM library (libnvvm) is not available in this Python environment. ")
+
+        _nvvm_module = nvvm
+        return _nvvm_module
+
+    except RuntimeError as e:
+        _nvvm_module = None
+        raise e
+
+def _find_libdevice_path():
+    """Find libdevice*.bc for NVVM compilation using cuda.pathfinder."""
+    from cuda.pathfinder import get_libdevice_path
+    return get_libdevice_path()
+
+
+cdef inline bint _process_define_macro_inner(list options, object macro) except? -1:
+    """Process a single define macro, returning True if successful."""
+    if isinstance(macro, str):
+        options.append(f"--define-macro={macro}")
+        return True
+    if isinstance(macro, tuple):
+        if len(macro) != 2 or any(not isinstance(val, str) for val in macro):
+            raise RuntimeError(f"Expected define_macro tuple[str, str], got {macro}")
+        options.append(f"--define-macro={macro[0]}={macro[1]}")
+        return True
+    return False
+
+
+cdef inline void _process_define_macro(list options, object macro) except *:
+    """Process define_macro option which can be str, tuple, or list thereof."""
+    union_type = "Union[str, tuple[str, str]]"
+    if _process_define_macro_inner(options, macro):
+        return
+    if is_nested_sequence(macro):
+        for seq_macro in macro:
+            if not _process_define_macro_inner(options, seq_macro):
+                raise RuntimeError(f"Expected define_macro {union_type}, got {seq_macro}")
+        return
+    raise RuntimeError(f"Expected define_macro {union_type}, list[{union_type}], got {macro}")
+
+
+cpdef bint _can_load_generated_ptx() except? -1:
+    """Check if the driver can load PTX generated by the current NVRTC version."""
+    driver_ver = handle_return(driver.cuDriverGetVersion())
+    nvrtc_major, nvrtc_minor = handle_return(nvrtc.nvrtcVersion())
+    return nvrtc_major * 1000 + nvrtc_minor * 10 <= driver_ver
+
+
+cdef inline object _translate_program_options(object options):
+    """Translate ProgramOptions to LinkerOptions for PTX compilation."""
+    return LinkerOptions(
+        name=options.name,
+        arch=options.arch,
+        max_register_count=options.max_register_count,
+        time=options.time,
+        link_time_optimization=options.link_time_optimization,
+        debug=options.debug,
+        lineinfo=options.lineinfo,
+        ftz=options.ftz,
+        prec_div=options.prec_div,
+        prec_sqrt=options.prec_sqrt,
+        fma=options.fma,
+        split_compile=options.split_compile,
+        ptxas_options=options.ptxas_options,
+        no_cache=options.no_cache,
+    )
+
+
+cdef inline int Program_init(Program self, object code, str code_type, object options) except -1:
+    """Initialize a Program instance."""
+    cdef cynvrtc.nvrtcProgram nvrtc_prog
+    cdef cynvvm.nvvmProgram nvvm_prog
+    cdef bytes code_bytes
+    cdef const char* code_ptr
+    cdef const char* name_ptr
+    cdef size_t code_len
+    cdef bytes module_bytes
+    cdef const char* module_ptr
+    cdef size_t module_len
+
+    self._options = options = check_or_create_options(ProgramOptions, options, "Program options")
+    code_type = code_type.lower()
+    self._module_count = 0
+    self._use_libdevice = False
+
+    if code_type == "c++":
+        assert_type(code, str)
+        if options.extra_sources is not None:
+            raise ValueError("extra_sources is not supported by the NVRTC backend (C++ code_type)")
+        # TODO: support pre-loaded headers & include names
+        code_bytes = code.encode()
+        code_ptr = <const char*>code_bytes
+        name_ptr = <const char*>options._name
+
+        with nogil:
+            HANDLE_RETURN_NVRTC(NULL, cynvrtc.nvrtcCreateProgram(
+                &nvrtc_prog, code_ptr, name_ptr, 0, NULL, NULL))
+        self._h_nvrtc = create_nvrtc_program_handle(nvrtc_prog)
+        self._backend = "NVRTC"
+        self._linker = None
+
+    elif code_type == "ptx":
+        assert_type(code, str)
+        if options.extra_sources is not None:
+            raise ValueError("extra_sources is not supported by the PTX backend.")
+        self._linker = Linker(
+            ObjectCode._init(code.encode(), code_type), options=_translate_program_options(options)
+        )
+        self._backend = self._linker.backend
+
+    elif code_type == "nvvm":
+        _get_nvvm_module()  # Validate NVVM availability
+        if isinstance(code, str):
+            code = code.encode("utf-8")
+        elif not isinstance(code, (bytes, bytearray)):
+            raise TypeError("NVVM IR code must be provided as str, bytes, or bytearray")
+
+        code_ptr = <const char*>(<bytes>code)
+        name_ptr = <const char*>options._name
+        code_len = len(code)
+
+        with nogil:
+            HANDLE_RETURN_NVVM(NULL, cynvvm.nvvmCreateProgram(&nvvm_prog))
+        self._h_nvvm = create_nvvm_program_handle(nvvm_prog)  # RAII from here
+        with nogil:
+            HANDLE_RETURN_NVVM(nvvm_prog, cynvvm.nvvmAddModuleToProgram(nvvm_prog, code_ptr, code_len, name_ptr))
+        self._module_count = 1
+
+        # Add extra modules if provided
+        if options.extra_sources is not None:
+            if not is_sequence(options.extra_sources):
+                raise TypeError(
+                    "extra_sources must be a sequence of 2-tuples: ((name1, source1), (name2, source2), ...)"
+                )
+            for i, module in enumerate(options.extra_sources):
+                if not isinstance(module, tuple) or len(module) != 2:
+                    raise TypeError(
+                        f"Each extra module must be a 2-tuple (name, source)"
+                        f", got {type(module).__name__} at index {i}"
+                    )
+
+                module_name, module_source = module
+
+                if not isinstance(module_name, str):
+                    raise TypeError(f"Module name at index {i} must be a string, got {type(module_name).__name__}")
+
+                if isinstance(module_source, str):
+                    # Textual LLVM IR - encode to UTF-8 bytes
+                    module_source = module_source.encode("utf-8")
+                elif not isinstance(module_source, (bytes, bytearray)):
+                    raise TypeError(
+                        f"Module source at index {i} must be str (textual LLVM IR), bytes (textual LLVM IR or bitcode), "
+                        f"or bytearray, got {type(module_source).__name__}"
+                    )
+
+                if len(module_source) == 0:
+                    raise ValueError(f"Module source for '{module_name}' (index {i}) cannot be empty")
+
+                # Add the module using NVVM API
+                module_bytes = module_source if isinstance(module_source, bytes) else bytes(module_source)
+                module_ptr = <const char*>module_bytes
+                module_len = len(module_bytes)
+                module_name_bytes = module_name.encode()
+                module_name_ptr = <const char*>module_name_bytes
+
+                with nogil:
+                    HANDLE_RETURN_NVVM(nvvm_prog, cynvvm.nvvmAddModuleToProgram(
+                        nvvm_prog, module_ptr, module_len, module_name_ptr))
+                self._module_count += 1
+
+        # Store use_libdevice flag
+        if options.use_libdevice:
+            self._use_libdevice = True
+        
+        self._backend = "NVVM"
+        self._linker = None
+
+    else:
+        supported_code_types = ("c++", "ptx", "nvvm")
+        assert code_type not in supported_code_types, f"{code_type=}"
+        raise RuntimeError(f"Unsupported {code_type=} ({supported_code_types=})")
+
+    return 0
+
+
+
+cdef object Program_compile_nvrtc(Program self, str target_type, object name_expressions, object logs):
+    """Compile using NVRTC backend and return ObjectCode."""
+    cdef cynvrtc.nvrtcProgram prog = as_cu(self._h_nvrtc)
+    cdef size_t output_size = 0
+    cdef size_t logsize = 0
+    cdef vector[const char*] options_vec
+    cdef char* data_ptr = NULL
+    cdef bytes name_bytes
+    cdef const char* name_ptr = NULL
+    cdef const char* lowered_name = NULL
+    cdef dict symbol_mapping = {}
+
+    # Add name expressions before compilation
+    if name_expressions:
+        for n in name_expressions:
+            name_bytes = n.encode() if isinstance(n, str) else n
+            name_ptr = <const char*>name_bytes
+            HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcAddNameExpression(prog, name_ptr))
+
+    # Build options array
+    options_list = self._options.as_bytes("nvrtc", target_type)
+    options_vec.resize(len(options_list))
+    for i in range(len(options_list)):
+        options_vec[i] = <const char*>(<bytes>options_list[i])
+
+    # Compile
+    with nogil:
+        HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcCompileProgram(prog, <int>options_vec.size(), options_vec.data()))
+
+    # Get compiled output based on target type
+    if target_type == "ptx":
+        HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetPTXSize(prog, &output_size))
+        data = bytearray(output_size)
+        data_ptr = <char*>(<bytearray>data)
+        with nogil:
+            HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetPTX(prog, data_ptr))
+    elif target_type == "cubin":
+        HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetCUBINSize(prog, &output_size))
+        data = bytearray(output_size)
+        data_ptr = <char*>(<bytearray>data)
+        with nogil:
+            HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetCUBIN(prog, data_ptr))
+    else:  # ltoir
+        HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetLTOIRSize(prog, &output_size))
+        data = bytearray(output_size)
+        data_ptr = <char*>(<bytearray>data)
+        with nogil:
+            HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetLTOIR(prog, data_ptr))
+
+    # Get lowered names after compilation
+    if name_expressions:
+        for n in name_expressions:
+            name_bytes = n.encode() if isinstance(n, str) else n
+            name_ptr = <const char*>name_bytes
+            HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetLoweredName(prog, name_ptr, &lowered_name))
+            symbol_mapping[n] = lowered_name if lowered_name != NULL else None
+
+    # Get compilation log if requested
+    if logs is not None:
+        HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetProgramLogSize(prog, &logsize))
+        if logsize > 1:
+            log = bytearray(logsize)
+            data_ptr = <char*>(<bytearray>log)
+            with nogil:
+                HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetProgramLog(prog, data_ptr))
+            logs.write(log.decode("utf-8", errors="backslashreplace"))
+
+    return ObjectCode._init(bytes(data), target_type, symbol_mapping=symbol_mapping, name=self._options.name)
+
+
+cdef object Program_compile_nvvm(Program self, str target_type, object logs):
+    """Compile using NVVM backend and return ObjectCode."""
+    cdef cynvvm.nvvmProgram prog = as_cu(self._h_nvvm)
+    cdef size_t output_size = 0
+    cdef size_t logsize = 0
+    cdef vector[const char*] options_vec
+    cdef char* data_ptr = NULL
+    cdef bytes libdevice_bytes
+    cdef const char* libdevice_ptr
+    cdef size_t libdevice_len
+
+    options_list = self._options.as_bytes("nvvm", target_type)
+    options_vec.resize(len(options_list))
+    for i in range(len(options_list)):
+        options_vec[i] = <const char*>(<bytes>options_list[i])
+
+    with nogil:
+        HANDLE_RETURN_NVVM(prog, cynvvm.nvvmVerifyProgram(prog, <int>options_vec.size(), options_vec.data()))
+
+    # Load libdevice if requested  - following numba-cuda
+    if self._use_libdevice:
+        libdevice_path = _find_libdevice_path()
+        if libdevice_path is None:
+            raise RuntimeError(
+                "use_libdevice=True but could not find libdevice.10.bc. "
+                "Ensure CUDA toolkit is installed."
+            )
+        with open(libdevice_path, "rb") as f:
+            libdevice_bytes = f.read()
+        libdevice_ptr = <const char*>libdevice_bytes
+        libdevice_len = len(libdevice_bytes)
+        # Use lazy_add_module
+        with nogil:
+            HANDLE_RETURN_NVVM(prog, cynvvm.nvvmLazyAddModuleToProgram(
+                prog, libdevice_ptr, libdevice_len, NULL))
+
+    with nogil:
+        HANDLE_RETURN_NVVM(prog, cynvvm.nvvmCompileProgram(prog, <int>options_vec.size(), options_vec.data()))
+
+    HANDLE_RETURN_NVVM(prog, cynvvm.nvvmGetCompiledResultSize(prog, &output_size))
+    data = bytearray(output_size)
+    data_ptr = <char*>(<bytearray>data)
+    with nogil:
+        HANDLE_RETURN_NVVM(prog, cynvvm.nvvmGetCompiledResult(prog, data_ptr))
+
+    if logs is not None:
+        HANDLE_RETURN_NVVM(prog, cynvvm.nvvmGetProgramLogSize(prog, &logsize))
+        if logsize > 1:
+            log = bytearray(logsize)
+            data_ptr = <char*>(<bytearray>log)
+            with nogil:
+                HANDLE_RETURN_NVVM(prog, cynvvm.nvvmGetProgramLog(prog, data_ptr))
+            logs.write(log.decode("utf-8", errors="backslashreplace"))
+
+    return ObjectCode._init(bytes(data), target_type, name=self._options.name)
+
+# Supported target types per backend
+cdef dict SUPPORTED_TARGETS = {
+    "NVRTC": ("ptx", "cubin", "ltoir"),
+    "NVVM": ("ptx", "ltoir"),
+    "nvJitLink": ("cubin", "ptx"),
+    "driver": ("cubin", "ptx"),
+}
+
+
+cdef object Program_compile(Program self, str target_type, object name_expressions, object logs):
+    """Compile the program to the specified target type."""
+    # Validate target_type for this backend
+    supported = SUPPORTED_TARGETS.get(self._backend)
+    if supported is None:
+        raise ValueError(f'Unknown backend="{self._backend}"')
+    if target_type not in supported:
+        raise ValueError(
+            f'Unsupported target_type="{target_type}" for {self._backend} '
+            f'(supported: {", ".join(repr(t) for t in supported)})'
+        )
+
+    if self._backend == "NVRTC":
+        if target_type == "ptx" and not _can_load_generated_ptx():
+            warn(
+                "The CUDA driver version is older than the backend version. "
+                "The generated ptx will not be loadable by the current driver.",
+                stacklevel=2,
+                category=RuntimeWarning,
+            )
+        return Program_compile_nvrtc(self, target_type, name_expressions, logs)
+
+    elif self._backend == "NVVM":
+        return Program_compile_nvvm(self, target_type, logs)
+
+    else:
+        return self._linker.link(target_type)
+
+
+cdef inline list _prepare_nvrtc_options_impl(object opts):
+    """Build NVRTC-specific compiler options."""
+    options = [f"-arch={opts.arch}"]
+    if opts.relocatable_device_code is not None:
+        options.append(f"--relocatable-device-code={_handle_boolean_option(opts.relocatable_device_code)}")
+    if opts.extensible_whole_program is not None and opts.extensible_whole_program:
+        options.append("--extensible-whole-program")
+    if opts.debug is not None and opts.debug:
+        options.append("--device-debug")
+    if opts.lineinfo is not None and opts.lineinfo:
+        options.append("--generate-line-info")
+    if opts.device_code_optimize is not None and opts.device_code_optimize:
+        options.append("--dopt=on")
+    if opts.ptxas_options is not None:
+        opt_name = "--ptxas-options"
+        if isinstance(opts.ptxas_options, str):
+            options.append(f"{opt_name}={opts.ptxas_options}")
+        elif is_sequence(opts.ptxas_options):
+            for opt_value in opts.ptxas_options:
+                options.append(f"{opt_name}={opt_value}")
+    if opts.max_register_count is not None:
+        options.append(f"--maxrregcount={opts.max_register_count}")
+    if opts.ftz is not None:
+        options.append(f"--ftz={_handle_boolean_option(opts.ftz)}")
+    if opts.prec_sqrt is not None:
+        options.append(f"--prec-sqrt={_handle_boolean_option(opts.prec_sqrt)}")
+    if opts.prec_div is not None:
+        options.append(f"--prec-div={_handle_boolean_option(opts.prec_div)}")
+    if opts.fma is not None:
+        options.append(f"--fmad={_handle_boolean_option(opts.fma)}")
+    if opts.use_fast_math is not None and opts.use_fast_math:
+        options.append("--use_fast_math")
+    if opts.extra_device_vectorization is not None and opts.extra_device_vectorization:
+        options.append("--extra-device-vectorization")
+    if opts.link_time_optimization is not None and opts.link_time_optimization:
+        options.append("--dlink-time-opt")
+    if opts.gen_opt_lto is not None and opts.gen_opt_lto:
+        options.append("--gen-opt-lto")
+    if opts.define_macro is not None:
+        _process_define_macro(options, opts.define_macro)
+    if opts.undefine_macro is not None:
+        if isinstance(opts.undefine_macro, str):
+            options.append(f"--undefine-macro={opts.undefine_macro}")
+        elif is_sequence(opts.undefine_macro):
+            for macro in opts.undefine_macro:
+                options.append(f"--undefine-macro={macro}")
+    if opts.include_path is not None:
+        if isinstance(opts.include_path, str):
+            options.append(f"--include-path={opts.include_path}")
+        elif is_sequence(opts.include_path):
+            for path in opts.include_path:
+                options.append(f"--include-path={path}")
+    if opts.pre_include is not None:
+        if isinstance(opts.pre_include, str):
+            options.append(f"--pre-include={opts.pre_include}")
+        elif is_sequence(opts.pre_include):
+            for header in opts.pre_include:
+                options.append(f"--pre-include={header}")
+    if opts.no_source_include is not None and opts.no_source_include:
+        options.append("--no-source-include")
+    if opts.std is not None:
+        options.append(f"--std={opts.std}")
+    if opts.builtin_move_forward is not None:
+        options.append(f"--builtin-move-forward={_handle_boolean_option(opts.builtin_move_forward)}")
+    if opts.builtin_initializer_list is not None:
+        options.append(f"--builtin-initializer-list={_handle_boolean_option(opts.builtin_initializer_list)}")
+    if opts.disable_warnings is not None and opts.disable_warnings:
+        options.append("--disable-warnings")
+    if opts.restrict is not None and opts.restrict:
+        options.append("--restrict")
+    if opts.device_as_default_execution_space is not None and opts.device_as_default_execution_space:
+        options.append("--device-as-default-execution-space")
+    if opts.device_int128 is not None and opts.device_int128:
+        options.append("--device-int128")
+    if opts.device_float128 is not None and opts.device_float128:
+        options.append("--device-float128")
+    if opts.optimization_info is not None:
+        options.append(f"--optimization-info={opts.optimization_info}")
+    if opts.no_display_error_number is not None and opts.no_display_error_number:
+        options.append("--no-display-error-number")
+    if opts.diag_error is not None:
+        if isinstance(opts.diag_error, int):
+            options.append(f"--diag-error={opts.diag_error}")
+        elif is_sequence(opts.diag_error):
+            for error in opts.diag_error:
+                options.append(f"--diag-error={error}")
+    if opts.diag_suppress is not None:
+        if isinstance(opts.diag_suppress, int):
+            options.append(f"--diag-suppress={opts.diag_suppress}")
+        elif is_sequence(opts.diag_suppress):
+            for suppress in opts.diag_suppress:
+                options.append(f"--diag-suppress={suppress}")
+    if opts.diag_warn is not None:
+        if isinstance(opts.diag_warn, int):
+            options.append(f"--diag-warn={opts.diag_warn}")
+        elif is_sequence(opts.diag_warn):
+            for w in opts.diag_warn:
+                options.append(f"--diag-warn={w}")
+    if opts.brief_diagnostics is not None:
+        options.append(f"--brief-diagnostics={_handle_boolean_option(opts.brief_diagnostics)}")
+    if opts.time is not None:
+        options.append(f"--time={opts.time}")
+    if opts.split_compile is not None:
+        options.append(f"--split-compile={opts.split_compile}")
+    if opts.fdevice_syntax_only is not None and opts.fdevice_syntax_only:
+        options.append("--fdevice-syntax-only")
+    if opts.minimal is not None and opts.minimal:
+        options.append("--minimal")
+    if opts.no_cache is not None and opts.no_cache:
+        options.append("--no-cache")
+    if opts.fdevice_time_trace is not None:
+        options.append(f"--fdevice-time-trace={opts.fdevice_time_trace}")
+    if opts.frandom_seed is not None:
+        options.append(f"--frandom-seed={opts.frandom_seed}")
+    if opts.ofast_compile is not None:
+        options.append(f"--Ofast-compile={opts.ofast_compile}")
+    # PCH options (CUDA 12.8+)
+    if opts.pch is not None and opts.pch:
+        options.append("--pch")
+    if opts.create_pch is not None:
+        options.append(f"--create-pch={opts.create_pch}")
+    if opts.use_pch is not None:
+        options.append(f"--use-pch={opts.use_pch}")
+    if opts.pch_dir is not None:
+        options.append(f"--pch-dir={opts.pch_dir}")
+    if opts.pch_verbose is not None:
+        options.append(f"--pch-verbose={_handle_boolean_option(opts.pch_verbose)}")
+    if opts.pch_messages is not None:
+        options.append(f"--pch-messages={_handle_boolean_option(opts.pch_messages)}")
+    if opts.instantiate_templates_in_pch is not None:
+        options.append(
+            f"--instantiate-templates-in-pch={_handle_boolean_option(opts.instantiate_templates_in_pch)}"
+        )
+    if opts.numba_debug:
+        options.append("--numba-debug")
+    return [o.encode() for o in options]
+
+
+cdef inline object _prepare_nvvm_options_impl(object opts, bint as_bytes):
+    """Build NVVM-specific compiler options."""
+    options = []
+
+    # Options supported by NVVM
+    assert opts.arch is not None
+    arch = opts.arch
+    if arch.startswith("sm_"):
+        arch = f"compute_{arch[3:]}"
+    options.append(f"-arch={arch}")
+    if opts.debug is not None and opts.debug:
+        options.append("-g")
+    if opts.device_code_optimize is False:
+        options.append("-opt=0")
+    elif opts.device_code_optimize is True:
+        options.append("-opt=3")
+    # NVVM uses 0/1 instead of true/false for boolean options
+    if opts.ftz is not None:
+        options.append(f"-ftz={'1' if opts.ftz else '0'}")
+    if opts.prec_sqrt is not None:
+        options.append(f"-prec-sqrt={'1' if opts.prec_sqrt else '0'}")
+    if opts.prec_div is not None:
+        options.append(f"-prec-div={'1' if opts.prec_div else '0'}")
+    if opts.fma is not None:
+        options.append(f"-fma={'1' if opts.fma else '0'}")
+
+    # Check for unsupported options and raise error if they are set
+    unsupported = []
+    if opts.relocatable_device_code is not None:
+        unsupported.append("relocatable_device_code")
+    if opts.extensible_whole_program is not None and opts.extensible_whole_program:
+        unsupported.append("extensible_whole_program")
+    if opts.lineinfo is not None and opts.lineinfo:
+        unsupported.append("lineinfo")
+    if opts.ptxas_options is not None:
+        unsupported.append("ptxas_options")
+    if opts.max_register_count is not None:
+        unsupported.append("max_register_count")
+    if opts.use_fast_math is not None and opts.use_fast_math:
+        unsupported.append("use_fast_math")
+    if opts.extra_device_vectorization is not None and opts.extra_device_vectorization:
+        unsupported.append("extra_device_vectorization")
+    if opts.gen_opt_lto is not None and opts.gen_opt_lto:
+        unsupported.append("gen_opt_lto")
+    if opts.define_macro is not None:
+        unsupported.append("define_macro")
+    if opts.undefine_macro is not None:
+        unsupported.append("undefine_macro")
+    if opts.include_path is not None:
+        unsupported.append("include_path")
+    if opts.pre_include is not None:
+        unsupported.append("pre_include")
+    if opts.no_source_include is not None and opts.no_source_include:
+        unsupported.append("no_source_include")
+    if opts.std is not None:
+        unsupported.append("std")
+    if opts.builtin_move_forward is not None:
+        unsupported.append("builtin_move_forward")
+    if opts.builtin_initializer_list is not None:
+        unsupported.append("builtin_initializer_list")
+    if opts.disable_warnings is not None and opts.disable_warnings:
+        unsupported.append("disable_warnings")
+    if opts.restrict is not None and opts.restrict:
+        unsupported.append("restrict")
+    if opts.device_as_default_execution_space is not None and opts.device_as_default_execution_space:
+        unsupported.append("device_as_default_execution_space")
+    if opts.device_int128 is not None and opts.device_int128:
+        unsupported.append("device_int128")
+    if opts.optimization_info is not None:
+        unsupported.append("optimization_info")
+    if opts.no_display_error_number is not None and opts.no_display_error_number:
+        unsupported.append("no_display_error_number")
+    if opts.diag_error is not None:
+        unsupported.append("diag_error")
+    if opts.diag_suppress is not None:
+        unsupported.append("diag_suppress")
+    if opts.diag_warn is not None:
+        unsupported.append("diag_warn")
+    if opts.brief_diagnostics is not None:
+        unsupported.append("brief_diagnostics")
+    if opts.time is not None:
+        unsupported.append("time")
+    if opts.split_compile is not None:
+        unsupported.append("split_compile")
+    if opts.fdevice_syntax_only is not None and opts.fdevice_syntax_only:
+        unsupported.append("fdevice_syntax_only")
+    if opts.minimal is not None and opts.minimal:
+        unsupported.append("minimal")
+    if opts.numba_debug is not None and opts.numba_debug:
+        unsupported.append("numba_debug")
+    if unsupported:
+        raise CUDAError(f"The following options are not supported by NVVM backend: {', '.join(unsupported)}")
+
+    if as_bytes:
+        return [o.encode() for o in options]
+    else:
+        return options
\ No newline at end of file

From ca32d2b39dd7a3363dae40362ae9c3933c477e3f Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Wed, 11 Feb 2026 19:54:54 +0000
Subject: [PATCH 32/79] fix import

---
 cuda_core/tests/test_program.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index 44d69a9bd4..929e38fd7a 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -530,7 +530,7 @@ def test_nvvm_program_options_ltoir(init_cuda, nvvm_ir, options):
 @nvvm_available
 def test_nvvm_program_with_single_extra_source(nvvm_ir):
     """Test NVVM program with a single extra source"""
-    from cuda.core.experimental._program import _get_nvvm_module
+    from cuda.core._program import _get_nvvm_module
 
     nvvm = _get_nvvm_module()
     major, minor, debug_major, debug_minor = nvvm.ir_version()
@@ -568,7 +568,7 @@ def test_nvvm_program_with_single_extra_source(nvvm_ir):
 @nvvm_available
 def test_nvvm_program_with_multiple_extra_sources():
     """Test NVVM program with multiple extra sources"""
-    from cuda.core.experimental._program import _get_nvvm_module
+    from cuda.core._program import _get_nvvm_module
 
     nvvm = _get_nvvm_module()
     major, minor, debug_major, debug_minor = nvvm.ir_version()

From fac190726a88835239b3097aeca7879f8bac1fce Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Thu, 12 Feb 2026 04:16:39 +0000
Subject: [PATCH 33/79] fix tests

---
 cuda_core/tests/test_program.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index 929e38fd7a..e439353aa9 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -11,6 +11,7 @@
 from cuda.core._module import Kernel, ObjectCode
 from cuda.core._program import Program, ProgramOptions
 from cuda.core._utils.cuda_utils import CUDAError, driver, handle_return
+from cuda_python_test_helpers.nvvm_bitcode import minimal_nvvmir  # noqa: F401
 
 cuda_driver_version = handle_return(driver.cuDriverGetVersion())
 is_culink_backend = _linker._decide_nvjitlink_or_driver()

From 4a01e068b79f3834a72934eb149c80025edd9341 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Thu, 12 Feb 2026 04:39:22 +0000
Subject: [PATCH 34/79] fix ruff check

---
 cuda_core/tests/test_program.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index e439353aa9..daef53da4e 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -11,7 +11,7 @@
 from cuda.core._module import Kernel, ObjectCode
 from cuda.core._program import Program, ProgramOptions
 from cuda.core._utils.cuda_utils import CUDAError, driver, handle_return
-from cuda_python_test_helpers.nvvm_bitcode import minimal_nvvmir  # noqa: F401
+from cuda_python_test_helpers.nvvm_bitcode import minimal_nvvmir  # noqa: F401, F811
 
 cuda_driver_version = handle_return(driver.cuDriverGetVersion())
 is_culink_backend = _linker._decide_nvjitlink_or_driver()
@@ -650,7 +650,7 @@ def test_nvvm_program_with_multiple_extra_sources():
 
 
 @nvvm_available
-def test_bitcode_format(minimal_nvvmir):
+def test_bitcode_format(minimal_nvvmir):  # noqa: F811
     if len(minimal_nvvmir) < 4:
         pytest.skip("Bitcode file is not valid or empty")
 

From 2d5252fb49bf6bb1db129b030b2f48ab400a6537 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Thu, 12 Feb 2026 04:44:28 +0000
Subject: [PATCH 35/79] ruff fix find_libdevice

---
 cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
index 0cfcd4e493..e56fd17114 100644
--- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
+++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
@@ -4,10 +4,10 @@
 import glob
 import os
 
+from cuda.pathfinder._dynamic_libs.load_dl_common import DynamicLibNotFoundError as DynamicLibNotFoundError
 from cuda.pathfinder._dynamic_libs.supported_nvidia_libs import IS_WINDOWS
 from cuda.pathfinder._utils.env_vars import get_cuda_home_or_path
 from cuda.pathfinder._utils.find_sub_dirs import find_sub_dirs_all_sitepackages
-from cuda.pathfinder._dynamic_libs.load_dl_common import DynamicLibNotFoundError as DynamicLibNotFoundError
 
 # Site-package paths for libdevice (following SITE_PACKAGES_LIBDIRS pattern)
 SITE_PACKAGES_LIBDEVICE_DIRS = (
@@ -21,6 +21,7 @@
 else:
     bases = ["/usr/local/cuda", "/opt/cuda"]
 
+
 def _no_such_file_in_dir(dir_path: str, filename: str, error_messages: list[str], attachments: list[str]) -> None:
     error_messages.append(f"No such file: {os.path.join(dir_path, filename)}")
     if os.path.isdir(dir_path):

From 2976c24cad77e54091af010d12a8eae32d2fbcd2 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 12 Feb 2026 04:46:35 +0000
Subject: [PATCH 36/79] [pre-commit.ci] auto code formatting

---
 cuda_core/cuda/core/_program.pxd | 2 +-
 cuda_core/cuda/core/_program.pyx | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cuda_core/cuda/core/_program.pxd b/cuda_core/cuda/core/_program.pxd
index d4abe85ff8..d2ddc71513 100644
--- a/cuda_core/cuda/core/_program.pxd
+++ b/cuda_core/cuda/core/_program.pxd
@@ -14,4 +14,4 @@ cdef class Program:
         object _options  # ProgramOptions
         object __weakref__
         bint _use_libdevice      # Flag for libdevice loading
-        int _module_count  
+        int _module_count
diff --git a/cuda_core/cuda/core/_program.pyx b/cuda_core/cuda/core/_program.pyx
index 35bf55e545..4f0e0c8b78 100644
--- a/cuda_core/cuda/core/_program.pyx
+++ b/cuda_core/cuda/core/_program.pyx
@@ -625,7 +625,7 @@ cdef inline int Program_init(Program self, object code, str code_type, object op
         # Store use_libdevice flag
         if options.use_libdevice:
             self._use_libdevice = True
-        
+
         self._backend = "NVVM"
         self._linker = None
 

From 61c1e00f5d43b654305304ece7e38026e5d80bb7 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Thu, 12 Feb 2026 04:48:32 +0000
Subject: [PATCH 37/79] add spdx and copyright

---
 .../cuda_python_test_helpers/nvvm_bitcode.py                  | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py b/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py
index e42dbff085..5264b947d0 100644
--- a/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py
+++ b/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py
@@ -1,3 +1,7 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
 import binascii
 
 import pytest

From c6bea0c38d0adce012fc153f96093b36e3a89ec7 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Thu, 12 Feb 2026 11:44:16 +0000
Subject: [PATCH 38/79] rm redundant include and fix test

---
 cuda_pathfinder/cuda/pathfinder/__init__.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/cuda_pathfinder/cuda/pathfinder/__init__.py b/cuda_pathfinder/cuda/pathfinder/__init__.py
index ece060d81a..bdcbdea8f1 100644
--- a/cuda_pathfinder/cuda/pathfinder/__init__.py
+++ b/cuda_pathfinder/cuda/pathfinder/__init__.py
@@ -19,9 +19,7 @@
     locate_nvidia_header_directory as locate_nvidia_header_directory,
 )
 from cuda.pathfinder._headers.supported_nvidia_headers import SUPPORTED_HEADERS_CTK as _SUPPORTED_HEADERS_CTK
-from cuda.pathfinder._static_libs.find_libdevice import (
-    LibdeviceNotFoundError as LibdeviceNotFoundError,
-)
+
 from cuda.pathfinder._static_libs.find_libdevice import (
     find_libdevice as find_libdevice,
 )

From b7866cf5e38d4e5b4ecd1c3a26cbc744a6ac9e0c Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 12 Feb 2026 11:46:05 +0000
Subject: [PATCH 39/79] [pre-commit.ci] auto code formatting

---
 cuda_pathfinder/cuda/pathfinder/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cuda_pathfinder/cuda/pathfinder/__init__.py b/cuda_pathfinder/cuda/pathfinder/__init__.py
index bdcbdea8f1..63781965f3 100644
--- a/cuda_pathfinder/cuda/pathfinder/__init__.py
+++ b/cuda_pathfinder/cuda/pathfinder/__init__.py
@@ -19,7 +19,6 @@
     locate_nvidia_header_directory as locate_nvidia_header_directory,
 )
 from cuda.pathfinder._headers.supported_nvidia_headers import SUPPORTED_HEADERS_CTK as _SUPPORTED_HEADERS_CTK
-
 from cuda.pathfinder._static_libs.find_libdevice import (
     find_libdevice as find_libdevice,
 )

From 42832306f638a116c0b0c9182e6cf7898fdc969e Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Thu, 12 Feb 2026 18:58:45 +0000
Subject: [PATCH 40/79] refresh tests

---
 cuda_core/tests/test_program.py | 60 ++++++++++++---------------------
 1 file changed, 22 insertions(+), 38 deletions(-)

diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index daef53da4e..30cdad6b9e 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -445,11 +445,23 @@ def test_nvvm_compile_invalid_target(nvvm_ir):
 
 
 @nvvm_available
+@pytest.mark.parametrize("target_type", ["ptx", "ltoir"])
 @pytest.mark.parametrize(
     "options",
     [
         ProgramOptions(name="test1", arch="sm_90", device_code_optimize=False),
         ProgramOptions(name="test2", arch="sm_100", device_code_optimize=False),
+        ProgramOptions(name="test3", arch="sm_100", link_time_optimization=True),
+        ProgramOptions(
+            name="test4",
+            arch="sm_90",
+            ftz=True,
+            prec_sqrt=False,
+            prec_div=False,
+            fma=True,
+            device_code_optimize=True,
+            link_time_optimization=True,
+        ),
         pytest.param(
             ProgramOptions(name="test_sm110_1", arch="sm_110", device_code_optimize=False),
             marks=pytest.mark.skipif(
@@ -481,50 +493,22 @@ def test_nvvm_compile_invalid_target(nvvm_ir):
         ),
     ],
 )
-def test_nvvm_program_options(init_cuda, nvvm_ir, options):
-    """Test NVVM programs with different options"""
+def test_nvvm_program_options(init_cuda, nvvm_ir, options, target_type):
+    """Test NVVM programs with different options and target types (ptx/ltoir)"""
     program = Program(nvvm_ir, "nvvm", options)
     assert program.backend == "NVVM"
 
-    ptx_code = program.compile("ptx")
-    assert isinstance(ptx_code, ObjectCode)
-    assert ptx_code.name == options.name
-
-    code_content = ptx_code.code
-    ptx_text = code_content.decode() if isinstance(code_content, bytes) else str(code_content)
-    assert ".visible .entry simple(" in ptx_text
+    result = program.compile(target_type)
+    assert isinstance(result, ObjectCode)
+    assert result.name == options.name
 
-    program.close()
+    code_content = result.code
+    assert len(code_content) > 0
 
+    if target_type == "ptx":
+        ptx_text = code_content.decode() if isinstance(code_content, bytes) else str(code_content)
+        assert ".visible .entry simple(" in ptx_text
 
-@nvvm_available
-@pytest.mark.parametrize(
-    "options",
-    [
-        ProgramOptions(name="ltoir_test1", arch="sm_90", device_code_optimize=False),
-        ProgramOptions(name="ltoir_test2", arch="sm_100", link_time_optimization=True),
-        ProgramOptions(
-            name="ltoir_test3",
-            arch="sm_90",
-            ftz=True,
-            prec_sqrt=False,
-            prec_div=False,
-            fma=True,
-            device_code_optimize=True,
-            link_time_optimization=True,
-        ),
-    ],
-)
-def test_nvvm_program_options_ltoir(init_cuda, nvvm_ir, options):
-    """Test NVVM programs for LTOIR with different options"""
-    program = Program(nvvm_ir, "nvvm", options)
-    assert program.backend == "NVVM"
-
-    ltoir_code = program.compile("ltoir")
-    assert isinstance(ltoir_code, ObjectCode)
-    assert ltoir_code.name == options.name
-    code_content = ltoir_code.code
-    assert len(code_content) > 0
     program.close()
 
 

From 78f43283e2d795d498b5aa54d93c15d3eadab753 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Thu, 12 Feb 2026 19:05:29 +0000
Subject: [PATCH 41/79] add correct libdevice for CTK> 13

---
 cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
index e56fd17114..c7e3b8cf18 100644
--- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
+++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
@@ -11,7 +11,7 @@
 
 # Site-package paths for libdevice (following SITE_PACKAGES_LIBDIRS pattern)
 SITE_PACKAGES_LIBDEVICE_DIRS = (
-    "nvidia/cuda_nvvm/nvvm/libdevice",  # CTK 13+
+    "nvidia/cu13/nvvm/libdevice",       # CTK 13+
     "nvidia/cuda_nvcc/nvvm/libdevice",  # CTK <13
 )
 

From ddf48396621e10506a0cf128bb0348aed1caf73a Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 12 Feb 2026 19:14:03 +0000
Subject: [PATCH 42/79] [pre-commit.ci] auto code formatting

---
 cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
index c7e3b8cf18..0a37b31736 100644
--- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
+++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
@@ -11,7 +11,7 @@
 
 # Site-package paths for libdevice (following SITE_PACKAGES_LIBDIRS pattern)
 SITE_PACKAGES_LIBDEVICE_DIRS = (
-    "nvidia/cu13/nvvm/libdevice",       # CTK 13+
+    "nvidia/cu13/nvvm/libdevice",  # CTK 13+
     "nvidia/cuda_nvcc/nvvm/libdevice",  # CTK <13
 )
 

From cd3644e68345c75efde93f1335333739e2886a7e Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Fri, 13 Feb 2026 09:24:53 +0000
Subject: [PATCH 43/79] revamp design of pathfinder as LocatedHeaderDir

---
 cuda_pathfinder/cuda/pathfinder/__init__.py   |   8 +-
 .../pathfinder/_static_libs/find_libdevice.py | 111 +++++++++++-------
 cuda_pathfinder/tests/test_find_libdevice.py  |  55 ++++++---
 3 files changed, 116 insertions(+), 58 deletions(-)

diff --git a/cuda_pathfinder/cuda/pathfinder/__init__.py b/cuda_pathfinder/cuda/pathfinder/__init__.py
index 63781965f3..cc1eda8d2d 100644
--- a/cuda_pathfinder/cuda/pathfinder/__init__.py
+++ b/cuda_pathfinder/cuda/pathfinder/__init__.py
@@ -20,10 +20,10 @@
 )
 from cuda.pathfinder._headers.supported_nvidia_headers import SUPPORTED_HEADERS_CTK as _SUPPORTED_HEADERS_CTK
 from cuda.pathfinder._static_libs.find_libdevice import (
-    find_libdevice as find_libdevice,
-)
-from cuda.pathfinder._static_libs.find_libdevice import (
-    get_libdevice_path as get_libdevice_path,
+    BitcodeLibNotFoundError as BitcodeLibNotFoundError,
+    LocatedBitcodeLib as LocatedBitcodeLib,
+    find_bitcode_lib as find_bitcode_lib,
+    locate_bitcode_lib as locate_bitcode_lib,
 )
 
 from cuda.pathfinder._version import __version__  # isort: skip  # noqa: F401
diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
index 0a37b31736..f4422a5df1 100644
--- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
+++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
@@ -1,25 +1,46 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
+
 import functools
 import glob
 import os
+from dataclasses import dataclass
 
-from cuda.pathfinder._dynamic_libs.load_dl_common import DynamicLibNotFoundError as DynamicLibNotFoundError
+from cuda.pathfinder._dynamic_libs.load_dl_common import DynamicLibNotFoundError
 from cuda.pathfinder._dynamic_libs.supported_nvidia_libs import IS_WINDOWS
 from cuda.pathfinder._utils.env_vars import get_cuda_home_or_path
 from cuda.pathfinder._utils.find_sub_dirs import find_sub_dirs_all_sitepackages
 
-# Site-package paths for libdevice (following SITE_PACKAGES_LIBDIRS pattern)
-SITE_PACKAGES_LIBDEVICE_DIRS = (
-    "nvidia/cu13/nvvm/libdevice",  # CTK 13+
-    "nvidia/cuda_nvcc/nvvm/libdevice",  # CTK <13
-)
 
-FILENAME = "libdevice.10.bc"
+class BitcodeLibNotFoundError(DynamicLibNotFoundError):
+    """Raised when a bitcode library cannot be found."""
+    pass
+
+
+@dataclass(frozen=True)
+class LocatedBitcodeLib:
+    """Information about a located bitcode library.
+    """
+    name: str
+    abs_path: str
+    filename: str
+
+
+SUPPORTED_BITCODE_LIBS = {
+    "device": {
+        "filename": "libdevice.10.bc",
+        "rel_path": os.path.join("nvvm", "libdevice"),
+        "site_packages_dirs": (
+            "nvidia/cu13/nvvm/libdevice",       # CTK 13+
+            "nvidia/cuda_nvcc/nvvm/libdevice",  # CTK <13
+        ),
+    },
+}
+
 if IS_WINDOWS:
-    bases = [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"]
+    _COMMON_BASES = [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"]
 else:
-    bases = ["/usr/local/cuda", "/opt/cuda"]
+    _COMMON_BASES = ["/usr/local/cuda", "/opt/cuda"]
 
 
 def _no_such_file_in_dir(dir_path: str, filename: str, error_messages: list[str], attachments: list[str]) -> None:
@@ -32,19 +53,26 @@ def _no_such_file_in_dir(dir_path: str, filename: str, error_messages: list[str]
         attachments.append(f'  Directory does not exist: "{dir_path}"')
 
 
-class _FindLibdevice:
-    REL_PATH = os.path.join("nvvm", "libdevice")
-
-    def __init__(self) -> None:
+class _FindBitcodeLib:
+    def __init__(self, name: str) -> None:
+        if name not in SUPPORTED_BITCODE_LIBS:
+            raise ValueError(
+                f"Unknown bitcode library: '{name}'. "
+                f"Supported: {', '.join(sorted(SUPPORTED_BITCODE_LIBS.keys()))}"
+            )
+        self.name = name
+        self.config = SUPPORTED_BITCODE_LIBS[name]
+        self.filename = self.config["filename"]
+        self.rel_path = self.config["rel_path"]
+        self.site_packages_dirs = self.config["site_packages_dirs"]
         self.error_messages: list[str] = []
         self.attachments: list[str] = []
-        self.abs_path: str | None = None
 
     def try_site_packages(self) -> str | None:
-        for rel_dir in SITE_PACKAGES_LIBDEVICE_DIRS:
+        for rel_dir in self.site_packages_dirs:
             sub_dir = tuple(rel_dir.split("/"))
             for abs_dir in find_sub_dirs_all_sitepackages(sub_dir):
-                file_path = os.path.join(abs_dir, FILENAME)
+                file_path = os.path.join(abs_dir, self.filename)
                 if os.path.isfile(file_path):
                     return file_path
         return None
@@ -55,7 +83,7 @@ def try_with_conda_prefix(self) -> str | None:
             return None
 
         anchor = os.path.join(conda_prefix, "Library") if IS_WINDOWS else conda_prefix
-        file_path = os.path.join(anchor, self.REL_PATH, FILENAME)
+        file_path = os.path.join(anchor, self.rel_path, self.filename)
         if os.path.isfile(file_path):
             return file_path
         return None
@@ -66,29 +94,26 @@ def try_with_cuda_home(self) -> str | None:
             self.error_messages.append("CUDA_HOME/CUDA_PATH not set")
             return None
 
-        file_path = os.path.join(cuda_home, self.REL_PATH, FILENAME)
+        file_path = os.path.join(cuda_home, self.rel_path, self.filename)
         if os.path.isfile(file_path):
             return file_path
 
         _no_such_file_in_dir(
-            os.path.join(cuda_home, self.REL_PATH),
-            FILENAME,
+            os.path.join(cuda_home, self.rel_path),
+            self.filename,
             self.error_messages,
             self.attachments,
         )
         return None
 
     def try_common_paths(self) -> str | None:
-
-        for base in bases:
-            # Direct path
-            file_path = os.path.join(base, self.REL_PATH, FILENAME)
+        for base in _COMMON_BASES:
+            file_path = os.path.join(base, self.rel_path, self.filename)
             if os.path.isfile(file_path):
                 return file_path
-            # Versioned paths (e.g., /usr/local/cuda-13.0)
             for versioned in sorted(glob.glob(base + "*"), reverse=True):
                 if os.path.isdir(versioned):
-                    file_path = os.path.join(versioned, self.REL_PATH, FILENAME)
+                    file_path = os.path.join(versioned, self.rel_path, self.filename)
                     if os.path.isfile(file_path):
                         return file_path
         return None
@@ -96,12 +121,12 @@ def try_common_paths(self) -> str | None:
     def raise_not_found_error(self) -> None:
         err = ", ".join(self.error_messages) if self.error_messages else "No search paths available"
         att = "\n".join(self.attachments) if self.attachments else ""
-        raise DynamicLibNotFoundError(f'Failure finding "{FILENAME}": {err}\n{att}')
+        raise BitcodeLibNotFoundError(f'Failure finding "{self.filename}": {err}\n{att}')
 
 
-def get_libdevice_path() -> str | None:
-    """Get the path to libdevice*.bc, or None if not found."""
-    finder = _FindLibdevice()
+def locate_bitcode_lib(name: str) -> LocatedBitcodeLib | None:
+    """Locate a bitcode library by name."""
+    finder = _FindBitcodeLib(name)
 
     abs_path = finder.try_site_packages()
     if abs_path is None:
@@ -111,16 +136,22 @@ def get_libdevice_path() -> str | None:
     if abs_path is None:
         abs_path = finder.try_common_paths()
 
-    return abs_path
+    if abs_path is None:
+        return None
+
+    return LocatedBitcodeLib(
+        name=name,
+        abs_path=abs_path,
+        filename=finder.filename,
+    )
 
 
 @functools.cache
-def find_libdevice() -> str:
-    """Find the path to libdevice*.bc.
-    Raises:
-        DynamicLibNotFoundError: If libdevice.10.bc cannot be found
-    """
-    path_or_none = get_libdevice_path()
-    if path_or_none is None:
-        raise DynamicLibNotFoundError(f"{FILENAME} not found")
-    return path_or_none
+def find_bitcode_lib(name: str) -> str:
+    """Find the absolute path to a bitcode library."""
+    result = locate_bitcode_lib(name)
+    if result is None:
+        config = SUPPORTED_BITCODE_LIBS.get(name, {})
+        filename = config.get("filename", name)
+        raise BitcodeLibNotFoundError(f"{filename} not found")
+    return result.abs_path
diff --git a/cuda_pathfinder/tests/test_find_libdevice.py b/cuda_pathfinder/tests/test_find_libdevice.py
index a1dfc0b64f..d93929991f 100644
--- a/cuda_pathfinder/tests/test_find_libdevice.py
+++ b/cuda_pathfinder/tests/test_find_libdevice.py
@@ -5,20 +5,18 @@
 
 import pytest
 
-from cuda.pathfinder import find_libdevice
-from cuda.pathfinder._static_libs import find_libdevice as find_libdevice_module
+import cuda.pathfinder._static_libs.find_libdevice as find_libdevice_module
 
 FILENAME = "libdevice.10.bc"
 
 SITE_PACKAGES_REL_DIR_CUDA12 = "nvidia/cuda_nvcc/nvvm/libdevice"
 SITE_PACKAGES_REL_DIR_CUDA13 = "nvidia/cuda_nvvm/nvvm/libdevice"
 
-
 @pytest.fixture
 def clear_find_libdevice_cache():
-    find_libdevice.cache_clear()
+    find_libdevice_module.find_bitcode_lib.cache_clear()
     yield
-    find_libdevice.cache_clear()
+    find_libdevice_module.find_bitcode_lib.cache_clear()
 
 
 def _make_libdevice_file(dir_path: str) -> str:
@@ -44,10 +42,13 @@ def test_find_libdevice_via_site_packages(monkeypatch, mocker, tmp_path, rel_dir
     monkeypatch.delenv("CUDA_HOME", raising=False)
     monkeypatch.delenv("CUDA_PATH", raising=False)
 
-    result = find_libdevice()
+    result = find_libdevice_module.locate_bitcode_lib("device")
 
-    assert result == expected_path
-    assert os.path.isfile(result)
+    assert result is not None
+    assert result.abs_path == expected_path
+    assert result.name == "device"
+    assert result.filename == FILENAME
+    assert os.path.isfile(result.abs_path)
 
 
 # same for cu12/cu13
@@ -67,11 +68,11 @@ def test_find_libdevice_via_conda(monkeypatch, mocker, tmp_path):
     monkeypatch.delenv("CUDA_HOME", raising=False)
     monkeypatch.delenv("CUDA_PATH", raising=False)
 
-    result = find_libdevice()
-
-    assert result == expected_path
-    assert os.path.isfile(result)
+    result = find_libdevice_module.locate_bitcode_lib("device")
 
+    assert result is not None
+    assert result.abs_path == expected_path
+    assert os.path.isfile(result.abs_path)
 
 @pytest.mark.usefixtures("clear_find_libdevice_cache")
 def test_find_libdevice_via_cuda_home(monkeypatch, mocker, tmp_path):
@@ -88,7 +89,33 @@ def test_find_libdevice_via_cuda_home(monkeypatch, mocker, tmp_path):
     monkeypatch.setenv("CUDA_HOME", str(tmp_path))
     monkeypatch.delenv("CUDA_PATH", raising=False)
 
-    result = find_libdevice()
+    result = find_libdevice_module.locate_bitcode_lib("device")
+
+    assert result is not None
+    assert result.abs_path == expected_path
+    assert os.path.isfile(result.abs_path)
+    
+@pytest.mark.usefixtures("clear_find_libdevice_cache")
+def test_find_bitcode_lib_returns_path(monkeypatch, mocker, tmp_path):
+    rel_path = os.path.join("nvvm", "libdevice")
+    libdevice_dir = tmp_path / rel_path
+    expected_path = str(_make_libdevice_file(str(libdevice_dir)))
+
+    mocker.patch.object(
+        find_libdevice_module,
+        "find_sub_dirs_all_sitepackages",
+        return_value=[str(libdevice_dir)],
+    )
+    monkeypatch.delenv("CONDA_PREFIX", raising=False)
+    monkeypatch.delenv("CUDA_HOME", raising=False)
+    monkeypatch.delenv("CUDA_PATH", raising=False)
+
+    result = find_libdevice_module.find_bitcode_lib("device")
 
     assert result == expected_path
-    assert os.path.isfile(result)
+    assert isinstance(result, str)
+
+
+def test_find_bitcode_lib_invalid_name():
+    with pytest.raises(ValueError, match="Unknown bitcode library"):
+        find_libdevice_module.locate_bitcode_lib("invalid")
\ No newline at end of file

From 68b33a219c22ce24de971c89c7fa1babea6bf6d4 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Fri, 13 Feb 2026 09:27:07 +0000
Subject: [PATCH 44/79] refresh

---
 .../cuda/pathfinder/_static_libs/find_libdevice.py     | 10 +++++-----
 cuda_pathfinder/tests/test_find_libdevice.py           |  7 +++++--
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
index f4422a5df1..727ab1577f 100644
--- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
+++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
@@ -14,13 +14,14 @@
 
 class BitcodeLibNotFoundError(DynamicLibNotFoundError):
     """Raised when a bitcode library cannot be found."""
+
     pass
 
 
 @dataclass(frozen=True)
 class LocatedBitcodeLib:
-    """Information about a located bitcode library.
-    """
+    """Information about a located bitcode library."""
+
     name: str
     abs_path: str
     filename: str
@@ -31,7 +32,7 @@ class LocatedBitcodeLib:
         "filename": "libdevice.10.bc",
         "rel_path": os.path.join("nvvm", "libdevice"),
         "site_packages_dirs": (
-            "nvidia/cu13/nvvm/libdevice",       # CTK 13+
+            "nvidia/cu13/nvvm/libdevice",  # CTK 13+
             "nvidia/cuda_nvcc/nvvm/libdevice",  # CTK <13
         ),
     },
@@ -57,8 +58,7 @@ class _FindBitcodeLib:
     def __init__(self, name: str) -> None:
         if name not in SUPPORTED_BITCODE_LIBS:
             raise ValueError(
-                f"Unknown bitcode library: '{name}'. "
-                f"Supported: {', '.join(sorted(SUPPORTED_BITCODE_LIBS.keys()))}"
+                f"Unknown bitcode library: '{name}'. Supported: {', '.join(sorted(SUPPORTED_BITCODE_LIBS.keys()))}"
             )
         self.name = name
         self.config = SUPPORTED_BITCODE_LIBS[name]
diff --git a/cuda_pathfinder/tests/test_find_libdevice.py b/cuda_pathfinder/tests/test_find_libdevice.py
index d93929991f..78ed1dbe21 100644
--- a/cuda_pathfinder/tests/test_find_libdevice.py
+++ b/cuda_pathfinder/tests/test_find_libdevice.py
@@ -12,6 +12,7 @@
 SITE_PACKAGES_REL_DIR_CUDA12 = "nvidia/cuda_nvcc/nvvm/libdevice"
 SITE_PACKAGES_REL_DIR_CUDA13 = "nvidia/cuda_nvvm/nvvm/libdevice"
 
+
 @pytest.fixture
 def clear_find_libdevice_cache():
     find_libdevice_module.find_bitcode_lib.cache_clear()
@@ -74,6 +75,7 @@ def test_find_libdevice_via_conda(monkeypatch, mocker, tmp_path):
     assert result.abs_path == expected_path
     assert os.path.isfile(result.abs_path)
 
+
 @pytest.mark.usefixtures("clear_find_libdevice_cache")
 def test_find_libdevice_via_cuda_home(monkeypatch, mocker, tmp_path):
     rel_path = os.path.join("nvvm", "libdevice")
@@ -94,7 +96,8 @@ def test_find_libdevice_via_cuda_home(monkeypatch, mocker, tmp_path):
     assert result is not None
     assert result.abs_path == expected_path
     assert os.path.isfile(result.abs_path)
-    
+
+
 @pytest.mark.usefixtures("clear_find_libdevice_cache")
 def test_find_bitcode_lib_returns_path(monkeypatch, mocker, tmp_path):
     rel_path = os.path.join("nvvm", "libdevice")
@@ -118,4 +121,4 @@ def test_find_bitcode_lib_returns_path(monkeypatch, mocker, tmp_path):
 
 def test_find_bitcode_lib_invalid_name():
     with pytest.raises(ValueError, match="Unknown bitcode library"):
-        find_libdevice_module.locate_bitcode_lib("invalid")
\ No newline at end of file
+        find_libdevice_module.locate_bitcode_lib("invalid")

From 631a1138f5a13a58b1f6024b5e9d8ed4019ee136 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 13 Feb 2026 09:29:01 +0000
Subject: [PATCH 45/79] [pre-commit.ci] auto code formatting

---
 cuda_pathfinder/cuda/pathfinder/__init__.py                 | 6 ++++++
 .../cuda/pathfinder/_static_libs/find_libdevice.py          | 2 --
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/cuda_pathfinder/cuda/pathfinder/__init__.py b/cuda_pathfinder/cuda/pathfinder/__init__.py
index cc1eda8d2d..80dff6d486 100644
--- a/cuda_pathfinder/cuda/pathfinder/__init__.py
+++ b/cuda_pathfinder/cuda/pathfinder/__init__.py
@@ -21,8 +21,14 @@
 from cuda.pathfinder._headers.supported_nvidia_headers import SUPPORTED_HEADERS_CTK as _SUPPORTED_HEADERS_CTK
 from cuda.pathfinder._static_libs.find_libdevice import (
     BitcodeLibNotFoundError as BitcodeLibNotFoundError,
+)
+from cuda.pathfinder._static_libs.find_libdevice import (
     LocatedBitcodeLib as LocatedBitcodeLib,
+)
+from cuda.pathfinder._static_libs.find_libdevice import (
     find_bitcode_lib as find_bitcode_lib,
+)
+from cuda.pathfinder._static_libs.find_libdevice import (
     locate_bitcode_lib as locate_bitcode_lib,
 )
 
diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
index 727ab1577f..a45f03a19b 100644
--- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
+++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
@@ -15,8 +15,6 @@
 class BitcodeLibNotFoundError(DynamicLibNotFoundError):
     """Raised when a bitcode library cannot be found."""
 
-    pass
-
 
 @dataclass(frozen=True)
 class LocatedBitcodeLib:

From 7a02aee8c974070e0513fc098cc31480f9217903 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Fri, 13 Feb 2026 09:36:23 +0000
Subject: [PATCH 46/79] fix mypy errirs

---
 .../pathfinder/_static_libs/find_libdevice.py | 31 ++++++++++++-------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
index a45f03a19b..7e3c1a7805 100644
--- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
+++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
@@ -5,6 +5,7 @@
 import glob
 import os
 from dataclasses import dataclass
+from typing import TypedDict
 
 from cuda.pathfinder._dynamic_libs.load_dl_common import DynamicLibNotFoundError
 from cuda.pathfinder._dynamic_libs.supported_nvidia_libs import IS_WINDOWS
@@ -25,21 +26,27 @@ class LocatedBitcodeLib:
     filename: str
 
 
-SUPPORTED_BITCODE_LIBS = {
+class _BitcodeLibConfig(TypedDict):
+    filename: str
+    rel_path: str
+    site_packages_dirs: tuple[str, ...]
+
+
+SUPPORTED_BITCODE_LIBS: dict[str, _BitcodeLibConfig] = {
     "device": {
         "filename": "libdevice.10.bc",
         "rel_path": os.path.join("nvvm", "libdevice"),
         "site_packages_dirs": (
-            "nvidia/cu13/nvvm/libdevice",  # CTK 13+
-            "nvidia/cuda_nvcc/nvvm/libdevice",  # CTK <13
+            "nvidia/cu13/nvvm/libdevice",
+            "nvidia/cuda_nvcc/nvvm/libdevice",
         ),
     },
 }
 
 if IS_WINDOWS:
-    _COMMON_BASES = [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"]
+    _COMMON_BASES: list[str] = [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"]
 else:
-    _COMMON_BASES = ["/usr/local/cuda", "/opt/cuda"]
+    _COMMON_BASES: list[str] = ["/usr/local/cuda", "/opt/cuda"]
 
 
 def _no_such_file_in_dir(dir_path: str, filename: str, error_messages: list[str], attachments: list[str]) -> None:
@@ -58,11 +65,11 @@ def __init__(self, name: str) -> None:
             raise ValueError(
                 f"Unknown bitcode library: '{name}'. Supported: {', '.join(sorted(SUPPORTED_BITCODE_LIBS.keys()))}"
             )
-        self.name = name
-        self.config = SUPPORTED_BITCODE_LIBS[name]
-        self.filename = self.config["filename"]
-        self.rel_path = self.config["rel_path"]
-        self.site_packages_dirs = self.config["site_packages_dirs"]
+        self.name: str = name
+        self.config: _BitcodeLibConfig = SUPPORTED_BITCODE_LIBS[name]
+        self.filename: str = self.config["filename"]
+        self.rel_path: str = self.config["rel_path"]
+        self.site_packages_dirs: tuple[str, ...] = self.config["site_packages_dirs"]
         self.error_messages: list[str] = []
         self.attachments: list[str] = []
 
@@ -149,7 +156,7 @@ def find_bitcode_lib(name: str) -> str:
     """Find the absolute path to a bitcode library."""
     result = locate_bitcode_lib(name)
     if result is None:
-        config = SUPPORTED_BITCODE_LIBS.get(name, {})
-        filename = config.get("filename", name)
+        config = SUPPORTED_BITCODE_LIBS.get(name)
+        filename = config["filename"] if config else name
         raise BitcodeLibNotFoundError(f"{filename} not found")
     return result.abs_path

From b8f2eb076e2b703bae7883af6c13842c7c218ea2 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Fri, 13 Feb 2026 09:42:26 +0000
Subject: [PATCH 47/79] fix base var declaration

---
 .../cuda/pathfinder/_static_libs/find_libdevice.py       | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
index 7e3c1a7805..5d51f733d8 100644
--- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
+++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
@@ -43,10 +43,11 @@ class _BitcodeLibConfig(TypedDict):
     },
 }
 
-if IS_WINDOWS:
-    _COMMON_BASES: list[str] = [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"]
-else:
-    _COMMON_BASES: list[str] = ["/usr/local/cuda", "/opt/cuda"]
+_COMMON_BASES: list[str] = (
+    [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"]
+    if IS_WINDOWS
+    else ["/usr/local/cuda", "/opt/cuda"]
+)
 
 
 def _no_such_file_in_dir(dir_path: str, filename: str, error_messages: list[str], attachments: list[str]) -> None:

From 434b3c904c84cdbbd511fe64d6b2ed38be5d34d5 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Fri, 13 Feb 2026 09:49:58 +0000
Subject: [PATCH 48/79] format changes

---
 .../cuda/pathfinder/_static_libs/find_libdevice.py  | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
index 5d51f733d8..102b3b9f77 100644
--- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
+++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
@@ -13,6 +13,13 @@
 from cuda.pathfinder._utils.find_sub_dirs import find_sub_dirs_all_sitepackages
 
 
+_COMMON_BASES: list[str] = (
+    [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"]
+    if IS_WINDOWS
+    else ["/usr/local/cuda", "/opt/cuda"]
+)
+
+
 class BitcodeLibNotFoundError(DynamicLibNotFoundError):
     """Raised when a bitcode library cannot be found."""
 
@@ -43,12 +50,6 @@ class _BitcodeLibConfig(TypedDict):
     },
 }
 
-_COMMON_BASES: list[str] = (
-    [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"]
-    if IS_WINDOWS
-    else ["/usr/local/cuda", "/opt/cuda"]
-)
-
 
 def _no_such_file_in_dir(dir_path: str, filename: str, error_messages: list[str], attachments: list[str]) -> None:
     error_messages.append(f"No such file: {os.path.join(dir_path, filename)}")

From 9f0a3199978b6c042c0c35f0dd6121621a24344a Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Fri, 13 Feb 2026 09:51:56 +0000
Subject: [PATCH 49/79] format changes

---
 .../cuda/pathfinder/_static_libs/find_libdevice.py  | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
index 102b3b9f77..5d51f733d8 100644
--- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
+++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
@@ -13,13 +13,6 @@
 from cuda.pathfinder._utils.find_sub_dirs import find_sub_dirs_all_sitepackages
 
 
-_COMMON_BASES: list[str] = (
-    [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"]
-    if IS_WINDOWS
-    else ["/usr/local/cuda", "/opt/cuda"]
-)
-
-
 class BitcodeLibNotFoundError(DynamicLibNotFoundError):
     """Raised when a bitcode library cannot be found."""
 
@@ -50,6 +43,12 @@ class _BitcodeLibConfig(TypedDict):
     },
 }
 
+_COMMON_BASES: list[str] = (
+    [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"]
+    if IS_WINDOWS
+    else ["/usr/local/cuda", "/opt/cuda"]
+)
+
 
 def _no_such_file_in_dir(dir_path: str, filename: str, error_messages: list[str], attachments: list[str]) -> None:
     error_messages.append(f"No such file: {os.path.join(dir_path, filename)}")

From f499487a0ea6a2f63553d0d31802dd8759837020 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Mon, 16 Feb 2026 06:27:50 +0000
Subject: [PATCH 50/79] rename to bitcodelib

---
 cuda_pathfinder/cuda/pathfinder/__init__.py   |  8 +--
 ...{find_libdevice.py => find_bitcode_lib.py} |  0
 ..._libdevice.py => test_find_bitcode_lib.py} | 64 +++++++++----------
 3 files changed, 36 insertions(+), 36 deletions(-)
 rename cuda_pathfinder/cuda/pathfinder/_static_libs/{find_libdevice.py => find_bitcode_lib.py} (100%)
 rename cuda_pathfinder/tests/{test_find_libdevice.py => test_find_bitcode_lib.py} (57%)

diff --git a/cuda_pathfinder/cuda/pathfinder/__init__.py b/cuda_pathfinder/cuda/pathfinder/__init__.py
index 80dff6d486..29a3ff4979 100644
--- a/cuda_pathfinder/cuda/pathfinder/__init__.py
+++ b/cuda_pathfinder/cuda/pathfinder/__init__.py
@@ -19,16 +19,16 @@
     locate_nvidia_header_directory as locate_nvidia_header_directory,
 )
 from cuda.pathfinder._headers.supported_nvidia_headers import SUPPORTED_HEADERS_CTK as _SUPPORTED_HEADERS_CTK
-from cuda.pathfinder._static_libs.find_libdevice import (
+from cuda.pathfinder._static_libs.find_bitcode_lib import (
     BitcodeLibNotFoundError as BitcodeLibNotFoundError,
 )
-from cuda.pathfinder._static_libs.find_libdevice import (
+from cuda.pathfinder._static_libs.find_bitcode_lib import (
     LocatedBitcodeLib as LocatedBitcodeLib,
 )
-from cuda.pathfinder._static_libs.find_libdevice import (
+from cuda.pathfinder._static_libs.find_bitcode_lib import (
     find_bitcode_lib as find_bitcode_lib,
 )
-from cuda.pathfinder._static_libs.find_libdevice import (
+from cuda.pathfinder._static_libs.find_bitcode_lib import (
     locate_bitcode_lib as locate_bitcode_lib,
 )
 
diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py
similarity index 100%
rename from cuda_pathfinder/cuda/pathfinder/_static_libs/find_libdevice.py
rename to cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py
diff --git a/cuda_pathfinder/tests/test_find_libdevice.py b/cuda_pathfinder/tests/test_find_bitcode_lib.py
similarity index 57%
rename from cuda_pathfinder/tests/test_find_libdevice.py
rename to cuda_pathfinder/tests/test_find_bitcode_lib.py
index 78ed1dbe21..b9ab71529f 100644
--- a/cuda_pathfinder/tests/test_find_libdevice.py
+++ b/cuda_pathfinder/tests/test_find_bitcode_lib.py
@@ -5,7 +5,7 @@
 
 import pytest
 
-import cuda.pathfinder._static_libs.find_libdevice as find_libdevice_module
+import cuda.pathfinder._static_libs.find_bitcode_lib as find_bitcode_lib_module
 
 FILENAME = "libdevice.10.bc"
 
@@ -14,13 +14,13 @@
 
 
 @pytest.fixture
-def clear_find_libdevice_cache():
-    find_libdevice_module.find_bitcode_lib.cache_clear()
+def clear_find_bitcode_lib_cache():
+    find_bitcode_lib_module.find_bitcode_lib.cache_clear()
     yield
-    find_libdevice_module.find_bitcode_lib.cache_clear()
+    find_bitcode_lib_module.find_bitcode_lib.cache_clear()
 
 
-def _make_libdevice_file(dir_path: str) -> str:
+def _make_bitcode_lib_file(dir_path: str) -> str:
     os.makedirs(dir_path, exist_ok=True)
     file_path = os.path.join(dir_path, FILENAME)
     with open(file_path, "wb"):
@@ -29,21 +29,21 @@ def _make_libdevice_file(dir_path: str) -> str:
 
 
 @pytest.mark.parametrize("rel_dir", [SITE_PACKAGES_REL_DIR_CUDA12, SITE_PACKAGES_REL_DIR_CUDA13])
-@pytest.mark.usefixtures("clear_find_libdevice_cache")
-def test_find_libdevice_via_site_packages(monkeypatch, mocker, tmp_path, rel_dir):
-    libdevice_dir = tmp_path.joinpath(*rel_dir.split("/"))
-    expected_path = str(_make_libdevice_file(str(libdevice_dir)))
+@pytest.mark.usefixtures("clear_find_bitcode_lib_cache")
+def test_find_bitcode_lib_via_site_packages(monkeypatch, mocker, tmp_path, rel_dir):
+    bitcode_lib_dir = tmp_path.joinpath(*rel_dir.split("/"))
+    expected_path = str(_make_bitcode_lib_file(str(bitcode_lib_dir)))
 
     mocker.patch.object(
-        find_libdevice_module,
+        find_bitcode_lib_module,
         "find_sub_dirs_all_sitepackages",
-        return_value=[str(libdevice_dir)],
+        return_value=[str(bitcode_lib_dir)],
     )
     monkeypatch.delenv("CONDA_PREFIX", raising=False)
     monkeypatch.delenv("CUDA_HOME", raising=False)
     monkeypatch.delenv("CUDA_PATH", raising=False)
 
-    result = find_libdevice_module.locate_bitcode_lib("device")
+    result = find_bitcode_lib_module.locate_bitcode_lib("device")
 
     assert result is not None
     assert result.abs_path == expected_path
@@ -53,15 +53,15 @@ def test_find_libdevice_via_site_packages(monkeypatch, mocker, tmp_path, rel_dir
 
 
 # same for cu12/cu13
-@pytest.mark.usefixtures("clear_find_libdevice_cache")
-def test_find_libdevice_via_conda(monkeypatch, mocker, tmp_path):
+@pytest.mark.usefixtures("clear_find_bitcode_lib_cache")
+def test_find_bitcode_lib_via_conda(monkeypatch, mocker, tmp_path):
     rel_path = os.path.join("nvvm", "libdevice")
-    libdevice_dir = tmp_path / rel_path
-    expected_path = str(_make_libdevice_file(str(libdevice_dir)))
+    bitcode_lib_dir = tmp_path / rel_path
+    expected_path = str(_make_bitcode_lib_file(str(bitcode_lib_dir)))
 
-    mocker.patch.object(find_libdevice_module, "IS_WINDOWS", False)
+    mocker.patch.object(find_bitcode_lib_module, "IS_WINDOWS", False)
     mocker.patch.object(
-        find_libdevice_module,
+        find_bitcode_lib_module,
         "find_sub_dirs_all_sitepackages",
         return_value=[],
     )
@@ -69,21 +69,21 @@ def test_find_libdevice_via_conda(monkeypatch, mocker, tmp_path):
     monkeypatch.delenv("CUDA_HOME", raising=False)
     monkeypatch.delenv("CUDA_PATH", raising=False)
 
-    result = find_libdevice_module.locate_bitcode_lib("device")
+    result = find_bitcode_lib_module.locate_bitcode_lib("device")
 
     assert result is not None
     assert result.abs_path == expected_path
     assert os.path.isfile(result.abs_path)
 
 
-@pytest.mark.usefixtures("clear_find_libdevice_cache")
-def test_find_libdevice_via_cuda_home(monkeypatch, mocker, tmp_path):
+@pytest.mark.usefixtures("clear_find_bitcode_lib_cache")
+def test_find_bitcode_lib_via_cuda_home(monkeypatch, mocker, tmp_path):
     rel_path = os.path.join("nvvm", "libdevice")
-    libdevice_dir = tmp_path / rel_path
-    expected_path = str(_make_libdevice_file(str(libdevice_dir)))
+    bitcode_lib_dir = tmp_path / rel_path
+    expected_path = str(_make_bitcode_lib_file(str(bitcode_lib_dir)))
 
     mocker.patch.object(
-        find_libdevice_module,
+        find_bitcode_lib_module,
         "find_sub_dirs_all_sitepackages",
         return_value=[],
     )
@@ -91,29 +91,29 @@ def test_find_libdevice_via_cuda_home(monkeypatch, mocker, tmp_path):
     monkeypatch.setenv("CUDA_HOME", str(tmp_path))
     monkeypatch.delenv("CUDA_PATH", raising=False)
 
-    result = find_libdevice_module.locate_bitcode_lib("device")
+    result = find_bitcode_lib_module.locate_bitcode_lib("device")
 
     assert result is not None
     assert result.abs_path == expected_path
     assert os.path.isfile(result.abs_path)
 
 
-@pytest.mark.usefixtures("clear_find_libdevice_cache")
+@pytest.mark.usefixtures("clear_find_bitcode_lib_cache")
 def test_find_bitcode_lib_returns_path(monkeypatch, mocker, tmp_path):
     rel_path = os.path.join("nvvm", "libdevice")
-    libdevice_dir = tmp_path / rel_path
-    expected_path = str(_make_libdevice_file(str(libdevice_dir)))
+    bitcode_lib_dir = tmp_path / rel_path
+    expected_path = str(_make_bitcode_lib_file(str(bitcode_lib_dir)))
 
     mocker.patch.object(
-        find_libdevice_module,
+        find_bitcode_lib_module,
         "find_sub_dirs_all_sitepackages",
-        return_value=[str(libdevice_dir)],
+        return_value=[str(bitcode_lib_dir)],
     )
     monkeypatch.delenv("CONDA_PREFIX", raising=False)
     monkeypatch.delenv("CUDA_HOME", raising=False)
     monkeypatch.delenv("CUDA_PATH", raising=False)
 
-    result = find_libdevice_module.find_bitcode_lib("device")
+    result = find_bitcode_lib_module.find_bitcode_lib("device")
 
     assert result == expected_path
     assert isinstance(result, str)
@@ -121,4 +121,4 @@ def test_find_bitcode_lib_returns_path(monkeypatch, mocker, tmp_path):
 
 def test_find_bitcode_lib_invalid_name():
     with pytest.raises(ValueError, match="Unknown bitcode library"):
-        find_libdevice_module.locate_bitcode_lib("invalid")
+        find_bitcode_lib_module.locate_bitcode_lib("invalid")

From 5ee41fed0124ebd155442d83035b1195fba5af7f Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Mon, 16 Feb 2026 13:03:31 +0000
Subject: [PATCH 51/79] refresh

---
 cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx | 1 +
 cuda_core/cuda/core/_program.pyx                          | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
index 37b39486ab..057b52f5eb 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
@@ -11,6 +11,7 @@ from .utils import FunctionNotFoundError, NotSupportedError
 
 from cuda.pathfinder import load_nvidia_dynamic_lib
 
+
 ###############################################################################
 # Extern
 ###############################################################################
diff --git a/cuda_core/cuda/core/_program.pyx b/cuda_core/cuda/core/_program.pyx
index 4f0e0c8b78..06896d6ddc 100644
--- a/cuda_core/cuda/core/_program.pyx
+++ b/cuda_core/cuda/core/_program.pyx
@@ -462,8 +462,8 @@ def _get_nvvm_module():
 
 def _find_libdevice_path():
     """Find libdevice*.bc for NVVM compilation using cuda.pathfinder."""
-    from cuda.pathfinder import get_libdevice_path
-    return get_libdevice_path()
+    from cuda.pathfinder import find_bitcode_lib
+    return find_bitcode_lib("device")
 
 
 cdef inline bint _process_define_macro_inner(list options, object macro) except? -1:

From 606652d4c9f921634eab990e4a507fc0b9573032 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Mon, 16 Feb 2026 13:08:32 +0000
Subject: [PATCH 52/79] refresh

---
 cuda_bindings/cuda/bindings/nvvm.pyx | 1 +
 cuda_core/cuda/core/_program.pyx     | 5 -----
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/cuda_bindings/cuda/bindings/nvvm.pyx b/cuda_bindings/cuda/bindings/nvvm.pyx
index bab2dfc3b9..8fb3849db3 100644
--- a/cuda_bindings/cuda/bindings/nvvm.pyx
+++ b/cuda_bindings/cuda/bindings/nvvm.pyx
@@ -12,6 +12,7 @@ from ._internal.utils cimport (get_buffer_pointer, get_nested_resource_ptr,
 from enum import IntEnum as _IntEnum
 
 
+
 ###############################################################################
 # Enum
 ###############################################################################
diff --git a/cuda_core/cuda/core/_program.pyx b/cuda_core/cuda/core/_program.pyx
index 06896d6ddc..9c0e19be44 100644
--- a/cuda_core/cuda/core/_program.pyx
+++ b/cuda_core/cuda/core/_program.pyx
@@ -729,11 +729,6 @@ cdef object Program_compile_nvvm(Program self, str target_type, object logs):
     # Load libdevice if requested  - following numba-cuda
     if self._use_libdevice:
         libdevice_path = _find_libdevice_path()
-        if libdevice_path is None:
-            raise RuntimeError(
-                "use_libdevice=True but could not find libdevice.10.bc. "
-                "Ensure CUDA toolkit is installed."
-            )
         with open(libdevice_path, "rb") as f:
             libdevice_bytes = f.read()
         libdevice_ptr = <const char*>libdevice_bytes

From 88d5278f383cd437d411d452f75f99b51d1e4580 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Mon, 16 Feb 2026 13:11:49 +0000
Subject: [PATCH 53/79] revert

---
 cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx | 1 +
 cuda_bindings/cuda/bindings/nvvm.pyx                 | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx
index 062a7abe1c..2d03097235 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx
@@ -11,6 +11,7 @@ from .utils import FunctionNotFoundError, NotSupportedError
 
 from cuda.pathfinder import load_nvidia_dynamic_lib
 
+
 ###############################################################################
 # Extern
 ###############################################################################
diff --git a/cuda_bindings/cuda/bindings/nvvm.pyx b/cuda_bindings/cuda/bindings/nvvm.pyx
index 8fb3849db3..bab2dfc3b9 100644
--- a/cuda_bindings/cuda/bindings/nvvm.pyx
+++ b/cuda_bindings/cuda/bindings/nvvm.pyx
@@ -12,7 +12,6 @@ from ._internal.utils cimport (get_buffer_pointer, get_nested_resource_ptr,
 from enum import IntEnum as _IntEnum
 
 
-
 ###############################################################################
 # Enum
 ###############################################################################

From 4cd7755ecd036266bd53b3e2c29fae5d7839311c Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Mon, 16 Feb 2026 17:22:21 +0000
Subject: [PATCH 54/79] refresh test_nvvm to use cupy test helpers

---
 cuda_bindings/tests/test_nvvm.py | 132 +------------------------------
 1 file changed, 1 insertion(+), 131 deletions(-)

diff --git a/cuda_bindings/tests/test_nvvm.py b/cuda_bindings/tests/test_nvvm.py
index 060a5268be..39380cfe7c 100644
--- a/cuda_bindings/tests/test_nvvm.py
+++ b/cuda_bindings/tests/test_nvvm.py
@@ -8,137 +8,7 @@
 
 import pytest
 from cuda.bindings import nvvm
-
-MINIMAL_NVVMIR_TXT_TEMPLATE = b"""\
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
-
-target triple = "nvptx64-nvidia-cuda"
-
-define void @kernel() {
-entry:
-  ret void
-}
-
-!nvvm.annotations = !{!0}
-!0 = !{void ()* @kernel, !"kernel", i32 1}
-
-!nvvmir.version = !{!1}
-!1 = !{i32 %d, i32 0, i32 %d, i32 0}
-"""  # noqa: E501
-
-MINIMAL_NVVMIR_BITCODE_STATIC = {
-    (1, 3):  # (major, debug_major)
-    "4243c0de3514000005000000620c30244a59be669dfbb4bf0b51804c01000000210c00007f010000"
-    "0b02210002000000160000000781239141c80449061032399201840c250508191e048b62800c4502"
-    "42920b42641032143808184b0a3232884870c421234412878c1041920264c808b1142043468820c9"
-    "01323284182a282a90317cb05c9120c3c8000000892000000b0000003222c80820624600212b2498"
-    "0c212524980c19270c85a4906032645c20246382a01801300128030173046000132677b00778a007"
-    "7cb0033a680377b0877420877408873618877a208770d8e012e5d006f0a0077640077a600774a007"
-    "7640076d900e71a00778a00778d006e980077a80077a80076d900e7160077a100776a0077160076d"
-    "900e7320077a300772a0077320076d900e7640077a600774a0077640076d900e71200778a0077120"
-    "0778a00771200778d006e6300772a0077320077a300772d006e6600774a0077640077a600774d006"
-    "f6100776a0077160077a100776d006f6300772a0077320077a300772d006f6600774a0077640077a"
-    "600774d006f610077280077a10077280077a10077280076de00e7160077a300772a0077640071a21"
-    "4c0e11de9c2e4fbbcfbe211560040000000000000000000000000620b141a0e86000004016080000"
-    "06000000321e980c19114c908c092647c6044362098c009401000000b1180000ac0000003308801c"
-    "c4e11c6614013d88433884c38c4280077978077398710ce6000fed100ef4800e330c421ec2c11dce"
-    "a11c6630053d88433884831bcc033dc8433d8c033dcc788c7470077b08077948877070077a700376"
-    "788770208719cc110eec900ee1300f6e300fe3f00ef0500e3310c41dde211cd8211dc2611e663089"
-    "3bbc833bd04339b4033cbc833c84033bccf0147660077b6807376887726807378087709087706007"
-    "76280776f8057678877780875f08877118877298877998812ceef00eeee00ef5c00eec300362c8a1"
-    "1ce4a11ccca11ce4a11cdc611cca211cc4811dca6106d6904339c84339984339c84339b8c3389443"
-    "3888033b94c32fbc833cfc823bd4033bb0c30cc7698770588772708374680778608774188774a087"
-    "19ce530fee000ff2500ee4900ee3400fe1200eec500e3320281ddcc11ec2411ed2211cdc811edce0"
-    "1ce4e11dea011e66185138b0433a9c833bcc50247660077b68073760877778077898514cf4900ff0"
-    "500e331e6a1eca611ce8211ddec11d7e011ee4a11ccc211df0610654858338ccc33bb0433dd04339"
-    "fcc23ce4433b88c33bb0c38cc50a877998877718877408077a28077298815ce3100eecc00ee5500e"
-    "f33023c1d2411ee4e117d8e11dde011e6648193bb0833db4831b84c3388c4339ccc33cb8c139c8c3"
-    "3bd4033ccc48b471080776600771088771588719dbc60eec600fede006f0200fe5300fe5200ff650"
-    "0e6e100ee3300ee5300ff3e006e9e00ee4500ef83023e2ec611cc2811dd8e117ec211de6211dc421"
-    "1dd8211de8211f66209d3bbc433db80339948339cc58bc7070077778077a08077a488777708719cb"
-    "e70eef300fe1e00ee9400fe9a00fe530c3010373a8077718875f988770708774a08774d087729881"
-    "844139e0c338b0433d904339cc40c4a01dcaa11de0411edec11c662463300ee1c00eec300fe9400f"
-    "e5000000792000001d000000721e482043880c19097232482023818c9191d144a01028643c313242"
-    "8e9021a318100a00060000006b65726e656c0000230802308240042308843082400c330c4230cc40"
-    "0c4441c84860821272b3b36b730973737ba30ba34b7b739b1b2528d271b3b36b4b9373b12b939b4b"
-    "7b731b2530000000a9180000250000000b0a7228877780077a587098433db8c338b04339d0c382e6"
-    "1cc6a10de8411ec2c11de6211de8211ddec11d1634e3600ee7500fe1200fe4400fe1200fe7500ef4"
-    "b08081077928877060077678877108077a28077258709cc338b4013ba4833d94c3026b1cd8211cdc"
-    "e11cdc201ce4611cdc201ce8811ec2611cd0a11cc8611cc2811dd861c1010ff4200fe1500ff4800e"
-    "00000000d11000000600000007cc3ca4833b9c033b94033da0833c94433890c30100000061200000"
-    "06000000130481860301000002000000075010cd14610000000000007120000003000000320e1022"
-    "8400fb020000000000000000650c00001f000000120394f000000000030000000600000006000000"
-    "4c000000010000005800000000000000580000000100000070000000000000000c00000013000000"
-    "1f000000080000000600000000000000700000000000000000000000010000000000000000000000"
-    "060000000000000006000000ffffffff00240000000000005d0c00000d0000001203946700000000"
-    "6b65726e656c31352e302e376e7670747836342d6e76696469612d637564613c737472696e673e00"
-    "00000000",
-    (2, 3):  # (major, debug_major)
-    "4243c0de3514000005000000620c30244a59be669dfbb4bf0b51804c01000000210c000080010000"
-    "0b02210002000000160000000781239141c80449061032399201840c250508191e048b62800c4502"
-    "42920b42641032143808184b0a3232884870c421234412878c1041920264c808b1142043468820c9"
-    "01323284182a282a90317cb05c9120c3c8000000892000000b0000003222c80820624600212b2498"
-    "0c212524980c19270c85a4906032645c20246382a01801300128030173046000132677b00778a007"
-    "7cb0033a680377b0877420877408873618877a208770d8e012e5d006f0a0077640077a600774a007"
-    "7640076d900e71a00778a00778d006e980077a80077a80076d900e7160077a100776a0077160076d"
-    "900e7320077a300772a0077320076d900e7640077a600774a0077640076d900e71200778a0077120"
-    "0778a00771200778d006e6300772a0077320077a300772d006e6600774a0077640077a600774d006"
-    "f6100776a0077160077a100776d006f6300772a0077320077a300772d006f6600774a0077640077a"
-    "600774d006f610077280077a10077280077a10077280076de00e7160077a300772a0077640071a21"
-    "4c0e11de9c2e4fbbcfbe211560040000000000000000000000000620b141a0286100004016080000"
-    "06000000321e980c19114c908c092647c60443620914c10840190000b1180000ac0000003308801c"
-    "c4e11c6614013d88433884c38c4280077978077398710ce6000fed100ef4800e330c421ec2c11dce"
-    "a11c6630053d88433884831bcc033dc8433d8c033dcc788c7470077b08077948877070077a700376"
-    "788770208719cc110eec900ee1300f6e300fe3f00ef0500e3310c41dde211cd8211dc2611e663089"
-    "3bbc833bd04339b4033cbc833c84033bccf0147660077b6807376887726807378087709087706007"
-    "76280776f8057678877780875f08877118877298877998812ceef00eeee00ef5c00eec300362c8a1"
-    "1ce4a11ccca11ce4a11cdc611cca211cc4811dca6106d6904339c84339984339c84339b8c3389443"
-    "3888033b94c32fbc833cfc823bd4033bb0c30cc7698770588772708374680778608774188774a087"
-    "19ce530fee000ff2500ee4900ee3400fe1200eec500e3320281ddcc11ec2411ed2211cdc811edce0"
-    "1ce4e11dea011e66185138b0433a9c833bcc50247660077b68073760877778077898514cf4900ff0"
-    "500e331e6a1eca611ce8211ddec11d7e011ee4a11ccc211df0610654858338ccc33bb0433dd04339"
-    "fcc23ce4433b88c33bb0c38cc50a877998877718877408077a28077298815ce3100eecc00ee5500e"
-    "f33023c1d2411ee4e117d8e11dde011e6648193bb0833db4831b84c3388c4339ccc33cb8c139c8c3"
-    "3bd4033ccc48b471080776600771088771588719dbc60eec600fede006f0200fe5300fe5200ff650"
-    "0e6e100ee3300ee5300ff3e006e9e00ee4500ef83023e2ec611cc2811dd8e117ec211de6211dc421"
-    "1dd8211de8211f66209d3bbc433db80339948339cc58bc7070077778077a08077a488777708719cb"
-    "e70eef300fe1e00ee9400fe9a00fe530c3010373a8077718875f988770708774a08774d087729881"
-    "844139e0c338b0433d904339cc40c4a01dcaa11de0411edec11c662463300ee1c00eec300fe9400f"
-    "e5000000792000001e000000721e482043880c19097232482023818c9191d144a01028643c313242"
-    "8e9021a318100a00060000006b65726e656c0000230802308240042308843082400c23080431c320"
-    "04c30c045118858c04262821373bbb36973037b737ba30bab437b7b95102231d373bbbb6343917bb"
-    "32b9b9b437b7518203000000a9180000250000000b0a7228877780077a587098433db8c338b04339"
-    "d0c382e61cc6a10de8411ec2c11de6211de8211ddec11d1634e3600ee7500fe1200fe4400fe1200f"
-    "e7500ef4b08081077928877060077678877108077a28077258709cc338b4013ba4833d94c3026b1c"
-    "d8211cdce11cdc201ce4611cdc201ce8811ec2611cd0a11cc8611cc2811dd861c1010ff4200fe150"
-    "0ff4800e00000000d11000000600000007cc3ca4833b9c033b94033da0833c94433890c301000000"
-    "6120000006000000130481860301000002000000075010cd14610000000000007120000003000000"
-    "320e10228400fc020000000000000000650c00001f000000120394f0000000000300000006000000"
-    "060000004c000000010000005800000000000000580000000100000070000000000000000c000000"
-    "130000001f0000000800000006000000000000007000000000000000000000000100000000000000"
-    "00000000060000000000000006000000ffffffff00240000000000005d0c00000d00000012039467"
-    "000000006b65726e656c31352e302e376e7670747836342d6e76696469612d637564613c73747269"
-    "6e673e0000000000",
-}
-
-
-@pytest.fixture(params=("txt", "bitcode_static"))
-def minimal_nvvmir(request):
-    major, minor, debug_major, debug_minor = nvvm.ir_version()
-
-    if request.param == "txt":
-        return MINIMAL_NVVMIR_TXT_TEMPLATE % (major, debug_major)
-
-    bitcode_static_binascii = MINIMAL_NVVMIR_BITCODE_STATIC.get((major, debug_major))
-    if bitcode_static_binascii:
-        return binascii.unhexlify(bitcode_static_binascii)
-    raise RuntimeError(
-        "Static bitcode for NVVM IR version "
-        f"{major}.{debug_major} is not available in this test.\n"
-        "Maintainers: Please run the helper script to generate it and add the "
-        "output to the MINIMAL_NVVMIR_BITCODE_STATIC dict:\n"
-        "  ../../toolshed/build_static_bitcode_input.py"
-    )
+from cuda_python_test_helpers.nvvm_bitcode import minimal_nvvmir  # noqa: F401, F811
 
 
 @pytest.fixture(params=[nvvm.compile_program, nvvm.verify_program])

From c562d28df70b65663d417ad8c57bd3ed0e2d5621 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Mon, 16 Feb 2026 17:28:40 +0000
Subject: [PATCH 55/79] refresh test_nvvm

---
 cuda_bindings/tests/test_nvvm.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cuda_bindings/tests/test_nvvm.py b/cuda_bindings/tests/test_nvvm.py
index 39380cfe7c..a46df8c66c 100644
--- a/cuda_bindings/tests/test_nvvm.py
+++ b/cuda_bindings/tests/test_nvvm.py
@@ -91,7 +91,7 @@ def test_c_or_v_program_fail_invalid_ir(compile_or_verify):
         assert get_program_log(prog) == "FileNameHere.ll (1, 0): parse expected top-level entity\x00"
 
 
-def test_c_or_v_program_fail_bad_option(minimal_nvvmir, compile_or_verify):
+def test_c_or_v_program_fail_bad_option(minimal_nvvmir, compile_or_verify):  # noqa: F401, F811
     with nvvm_program() as prog:
         nvvm.add_module_to_program(prog, minimal_nvvmir, len(minimal_nvvmir), "FileNameHere.ll")
         with pytest.raises(nvvm.nvvmError, match=match_exact("ERROR_INVALID_OPTION (7)")):
@@ -116,7 +116,7 @@ def test_get_buffer_empty(get_size, get_buffer):
 
 
 @pytest.mark.parametrize("options", [[], ["-opt=0"], ["-opt=3", "-g"]])
-def test_compile_program_with_minimal_nvvm_ir(minimal_nvvmir, options):
+def test_compile_program_with_minimal_nvvm_ir(minimal_nvvmir, options):  # noqa: F401, F811
     with nvvm_program() as prog:
         nvvm.add_module_to_program(prog, minimal_nvvmir, len(minimal_nvvmir), "FileNameHere.ll")
         try:
@@ -136,7 +136,7 @@ def test_compile_program_with_minimal_nvvm_ir(minimal_nvvmir, options):
 
 
 @pytest.mark.parametrize("options", [[], ["-opt=0"], ["-opt=3", "-g"]])
-def test_verify_program_with_minimal_nvvm_ir(minimal_nvvmir, options):
+def test_verify_program_with_minimal_nvvm_ir(minimal_nvvmir, options):  # noqa: F401, F811
     with nvvm_program() as prog:
         nvvm.add_module_to_program(prog, minimal_nvvmir, len(minimal_nvvmir), "FileNameHere.ll")
         nvvm.verify_program(prog, len(options), options)

From 150fd26f5953b699a97fbb2f2afeafd5072c2cee Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 16 Feb 2026 17:30:00 +0000
Subject: [PATCH 56/79] [pre-commit.ci] auto code formatting

---
 cuda_bindings/tests/test_nvvm.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cuda_bindings/tests/test_nvvm.py b/cuda_bindings/tests/test_nvvm.py
index a46df8c66c..81992616ba 100644
--- a/cuda_bindings/tests/test_nvvm.py
+++ b/cuda_bindings/tests/test_nvvm.py
@@ -2,7 +2,6 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-import binascii
 import re
 from contextlib import contextmanager
 

From afe17b2387a455b2bb0bd24ea602bc52f5bb76f9 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Mon, 16 Feb 2026 18:04:22 +0000
Subject: [PATCH 57/79] refresh with actual test

---
 .../tests/test_find_bitcode_lib.py            | 33 ++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/cuda_pathfinder/tests/test_find_bitcode_lib.py b/cuda_pathfinder/tests/test_find_bitcode_lib.py
index b9ab71529f..fbcb28dd4f 100644
--- a/cuda_pathfinder/tests/test_find_bitcode_lib.py
+++ b/cuda_pathfinder/tests/test_find_bitcode_lib.py
@@ -6,12 +6,19 @@
 import pytest
 
 import cuda.pathfinder._static_libs.find_bitcode_lib as find_bitcode_lib_module
+from cuda.pathfinder._static_libs.find_bitcode_lib import (
+    SUPPORTED_BITCODE_LIBS,
+    find_bitcode_lib,
+    locate_bitcode_lib,
+)
 
 FILENAME = "libdevice.10.bc"
 
 SITE_PACKAGES_REL_DIR_CUDA12 = "nvidia/cuda_nvcc/nvvm/libdevice"
 SITE_PACKAGES_REL_DIR_CUDA13 = "nvidia/cuda_nvvm/nvvm/libdevice"
 
+STRICTNESS = os.environ.get("CUDA_PATHFINDER_TEST_FIND_NVIDIA_BITCODE_LIB_STRICTNESS", "")
+
 
 @pytest.fixture
 def clear_find_bitcode_lib_cache():
@@ -28,6 +35,31 @@ def _make_bitcode_lib_file(dir_path: str) -> str:
     return file_path
 
 
+def _located_bitcode_lib_asserts(located_bitcode_lib):
+    """Common assertions for a located bitcode library."""
+    assert located_bitcode_lib is not None
+    assert isinstance(located_bitcode_lib.name, str)
+    assert isinstance(located_bitcode_lib.abs_path, str)
+    assert isinstance(located_bitcode_lib.filename, str)
+    assert os.path.isfile(located_bitcode_lib.abs_path)
+
+
+@pytest.mark.parametrize("libname", SUPPORTED_BITCODE_LIBS.keys())
+def test_locate_bitcode_lib(info_summary_append, libname):
+    lib_path = find_bitcode_lib(libname) if locate_bitcode_lib(libname) else None
+    located_lib = locate_bitcode_lib(libname)
+    assert lib_path is None if not located_lib else lib_path == located_lib.abs_path
+
+    info_summary_append(f"{lib_path=!r}")
+    if lib_path:
+        _located_bitcode_lib_asserts(located_lib)
+        assert os.path.isfile(lib_path)
+        expected_filename = SUPPORTED_BITCODE_LIBS[libname]["filename"]
+        assert os.path.basename(lib_path) == expected_filename
+    if STRICTNESS == "all_must_work":
+        assert lib_path is not None
+
+
 @pytest.mark.parametrize("rel_dir", [SITE_PACKAGES_REL_DIR_CUDA12, SITE_PACKAGES_REL_DIR_CUDA13])
 @pytest.mark.usefixtures("clear_find_bitcode_lib_cache")
 def test_find_bitcode_lib_via_site_packages(monkeypatch, mocker, tmp_path, rel_dir):
@@ -52,7 +84,6 @@ def test_find_bitcode_lib_via_site_packages(monkeypatch, mocker, tmp_path, rel_d
     assert os.path.isfile(result.abs_path)
 
 
-# same for cu12/cu13
 @pytest.mark.usefixtures("clear_find_bitcode_lib_cache")
 def test_find_bitcode_lib_via_conda(monkeypatch, mocker, tmp_path):
     rel_path = os.path.join("nvvm", "libdevice")

From 0f1c3934765a05ca1fb036d393c44a440cebab75 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Mon, 16 Feb 2026 18:24:07 +0000
Subject: [PATCH 58/79] rm try_common_paths

---
 .../_static_libs/find_bitcode_lib.py          | 21 -------------------
 1 file changed, 21 deletions(-)

diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py
index 5d51f733d8..eb1d3b88a6 100644
--- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py
+++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py
@@ -2,7 +2,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import functools
-import glob
 import os
 from dataclasses import dataclass
 from typing import TypedDict
@@ -43,12 +42,6 @@ class _BitcodeLibConfig(TypedDict):
     },
 }
 
-_COMMON_BASES: list[str] = (
-    [r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA", r"C:\CUDA"]
-    if IS_WINDOWS
-    else ["/usr/local/cuda", "/opt/cuda"]
-)
-
 
 def _no_such_file_in_dir(dir_path: str, filename: str, error_messages: list[str], attachments: list[str]) -> None:
     error_messages.append(f"No such file: {os.path.join(dir_path, filename)}")
@@ -112,18 +105,6 @@ def try_with_cuda_home(self) -> str | None:
         )
         return None
 
-    def try_common_paths(self) -> str | None:
-        for base in _COMMON_BASES:
-            file_path = os.path.join(base, self.rel_path, self.filename)
-            if os.path.isfile(file_path):
-                return file_path
-            for versioned in sorted(glob.glob(base + "*"), reverse=True):
-                if os.path.isdir(versioned):
-                    file_path = os.path.join(versioned, self.rel_path, self.filename)
-                    if os.path.isfile(file_path):
-                        return file_path
-        return None
-
     def raise_not_found_error(self) -> None:
         err = ", ".join(self.error_messages) if self.error_messages else "No search paths available"
         att = "\n".join(self.attachments) if self.attachments else ""
@@ -139,8 +120,6 @@ def locate_bitcode_lib(name: str) -> LocatedBitcodeLib | None:
         abs_path = finder.try_with_conda_prefix()
     if abs_path is None:
         abs_path = finder.try_with_cuda_home()
-    if abs_path is None:
-        abs_path = finder.try_common_paths()
 
     if abs_path is None:
         return None

From 9e7e75cbdbd346ae5eccaff463d5970b50a1c3e7 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Mon, 16 Feb 2026 19:02:38 +0000
Subject: [PATCH 59/79] refresh 1

---
 .../cuda/pathfinder/_static_libs/find_bitcode_lib.py        | 2 +-
 cuda_pathfinder/tests/test_find_bitcode_lib.py              | 6 +++---
 cuda_pathfinder/tests/test_find_nvidia_binaries.py          | 2 ++
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py
index eb1d3b88a6..88d0e883fd 100644
--- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py
+++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py
@@ -12,7 +12,7 @@
 from cuda.pathfinder._utils.find_sub_dirs import find_sub_dirs_all_sitepackages
 
 
-class BitcodeLibNotFoundError(DynamicLibNotFoundError):
+class BitcodeLibNotFoundError(RuntimeError):
     """Raised when a bitcode library cannot be found."""
 
 
diff --git a/cuda_pathfinder/tests/test_find_bitcode_lib.py b/cuda_pathfinder/tests/test_find_bitcode_lib.py
index fbcb28dd4f..68a2b1970f 100644
--- a/cuda_pathfinder/tests/test_find_bitcode_lib.py
+++ b/cuda_pathfinder/tests/test_find_bitcode_lib.py
@@ -17,8 +17,8 @@
 SITE_PACKAGES_REL_DIR_CUDA12 = "nvidia/cuda_nvcc/nvvm/libdevice"
 SITE_PACKAGES_REL_DIR_CUDA13 = "nvidia/cuda_nvvm/nvvm/libdevice"
 
-STRICTNESS = os.environ.get("CUDA_PATHFINDER_TEST_FIND_NVIDIA_BITCODE_LIB_STRICTNESS", "")
-
+STRICTNESS = os.environ.get("CUDA_PATHFINDER_TEST_FIND_NVIDIA_HEADERS_STRICTNESS", "see_what_works")
+assert STRICTNESS in ("see_what_works", "all_must_work")
 
 @pytest.fixture
 def clear_find_bitcode_lib_cache():
@@ -152,4 +152,4 @@ def test_find_bitcode_lib_returns_path(monkeypatch, mocker, tmp_path):
 
 def test_find_bitcode_lib_invalid_name():
     with pytest.raises(ValueError, match="Unknown bitcode library"):
-        find_bitcode_lib_module.locate_bitcode_lib("invalid")
+        find_bitcode_lib_module.locate_bitcode_lib("invalid")
\ No newline at end of file
diff --git a/cuda_pathfinder/tests/test_find_nvidia_binaries.py b/cuda_pathfinder/tests/test_find_nvidia_binaries.py
index 4f9eef223a..4a775a82ba 100644
--- a/cuda_pathfinder/tests/test_find_nvidia_binaries.py
+++ b/cuda_pathfinder/tests/test_find_nvidia_binaries.py
@@ -14,6 +14,8 @@
     SUPPORTED_BINARIES_ALL,
 )
 
+STRICTNESS = os.environ.get("CUDA_PATHFINDER_TEST_FIND_NVIDIA_HEADERS_STRICTNESS", "see_what_works")
+assert STRICTNESS in ("see_what_works", "all_must_work")
 
 def test_unknown_utility_name():
     with pytest.raises(UnsupportedBinaryError, match=r"'unknown-utility' is not supported"):

From 56dc23fab652fb1eabe304a5028ab1b1cd647f47 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 16 Feb 2026 19:03:55 +0000
Subject: [PATCH 60/79] [pre-commit.ci] auto code formatting

---
 .../cuda/pathfinder/_static_libs/find_bitcode_lib.py           | 1 -
 cuda_pathfinder/tests/test_find_bitcode_lib.py                 | 3 ++-
 cuda_pathfinder/tests/test_find_nvidia_binaries.py             | 1 +
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py
index 88d0e883fd..791be76345 100644
--- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py
+++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py
@@ -6,7 +6,6 @@
 from dataclasses import dataclass
 from typing import TypedDict
 
-from cuda.pathfinder._dynamic_libs.load_dl_common import DynamicLibNotFoundError
 from cuda.pathfinder._dynamic_libs.supported_nvidia_libs import IS_WINDOWS
 from cuda.pathfinder._utils.env_vars import get_cuda_home_or_path
 from cuda.pathfinder._utils.find_sub_dirs import find_sub_dirs_all_sitepackages
diff --git a/cuda_pathfinder/tests/test_find_bitcode_lib.py b/cuda_pathfinder/tests/test_find_bitcode_lib.py
index 68a2b1970f..06f0b475e0 100644
--- a/cuda_pathfinder/tests/test_find_bitcode_lib.py
+++ b/cuda_pathfinder/tests/test_find_bitcode_lib.py
@@ -20,6 +20,7 @@
 STRICTNESS = os.environ.get("CUDA_PATHFINDER_TEST_FIND_NVIDIA_HEADERS_STRICTNESS", "see_what_works")
 assert STRICTNESS in ("see_what_works", "all_must_work")
 
+
 @pytest.fixture
 def clear_find_bitcode_lib_cache():
     find_bitcode_lib_module.find_bitcode_lib.cache_clear()
@@ -152,4 +153,4 @@ def test_find_bitcode_lib_returns_path(monkeypatch, mocker, tmp_path):
 
 def test_find_bitcode_lib_invalid_name():
     with pytest.raises(ValueError, match="Unknown bitcode library"):
-        find_bitcode_lib_module.locate_bitcode_lib("invalid")
\ No newline at end of file
+        find_bitcode_lib_module.locate_bitcode_lib("invalid")
diff --git a/cuda_pathfinder/tests/test_find_nvidia_binaries.py b/cuda_pathfinder/tests/test_find_nvidia_binaries.py
index 4a775a82ba..c017ecb7d1 100644
--- a/cuda_pathfinder/tests/test_find_nvidia_binaries.py
+++ b/cuda_pathfinder/tests/test_find_nvidia_binaries.py
@@ -17,6 +17,7 @@
 STRICTNESS = os.environ.get("CUDA_PATHFINDER_TEST_FIND_NVIDIA_HEADERS_STRICTNESS", "see_what_works")
 assert STRICTNESS in ("see_what_works", "all_must_work")
 
+
 def test_unknown_utility_name():
     with pytest.raises(UnsupportedBinaryError, match=r"'unknown-utility' is not supported"):
         find_nvidia_binary_utility("unknown-utility")

From a19e9eef4a480e0d9d602b7ba2989f14795985ab Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Mon, 16 Feb 2026 19:14:12 +0000
Subject: [PATCH 61/79] refresh

---
 ci/tools/run-tests                                 | 5 +++--
 cuda_pathfinder/tests/test_find_bitcode_lib.py     | 5 ++---
 cuda_pathfinder/tests/test_find_nvidia_binaries.py | 1 -
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/ci/tools/run-tests b/ci/tools/run-tests
index 4f0c6d1d84..da089cf0af 100755
--- a/ci/tools/run-tests
+++ b/ci/tools/run-tests
@@ -30,8 +30,9 @@ popd
 if [[ "${test_module}" == "pathfinder" ]]; then
   pushd ./cuda_pathfinder
   echo "Running pathfinder tests with " \
-      "LD:${CUDA_PATHFINDER_TEST_LOAD_NVIDIA_DYNAMIC_LIB_STRICTNESS} " \
-      "FH:${CUDA_PATHFINDER_TEST_FIND_NVIDIA_HEADERS_STRICTNESS}"
+    "LD:${CUDA_PATHFINDER_TEST_LOAD_NVIDIA_DYNAMIC_LIB_STRICTNESS} " \
+    "FH:${CUDA_PATHFINDER_TEST_FIND_NVIDIA_HEADERS_STRICTNESS} " \
+    "BC:${CUDA_PATHFINDER_TEST_FIND_NVIDIA_BITCODE_LIB_STRICTNESS}"
   pytest -ra -s -v --durations=0 tests/ |& tee /tmp/pathfinder_test_log.txt
   # Fail if no "INFO test_" lines are found; capture line count otherwise
   line_count=$(grep '^INFO test_' /tmp/pathfinder_test_log.txt | wc -l)
diff --git a/cuda_pathfinder/tests/test_find_bitcode_lib.py b/cuda_pathfinder/tests/test_find_bitcode_lib.py
index 06f0b475e0..170fbc6279 100644
--- a/cuda_pathfinder/tests/test_find_bitcode_lib.py
+++ b/cuda_pathfinder/tests/test_find_bitcode_lib.py
@@ -17,10 +17,9 @@
 SITE_PACKAGES_REL_DIR_CUDA12 = "nvidia/cuda_nvcc/nvvm/libdevice"
 SITE_PACKAGES_REL_DIR_CUDA13 = "nvidia/cuda_nvvm/nvvm/libdevice"
 
-STRICTNESS = os.environ.get("CUDA_PATHFINDER_TEST_FIND_NVIDIA_HEADERS_STRICTNESS", "see_what_works")
+STRICTNESS = os.environ.get("CUDA_PATHFINDER_TEST_FIND_NVIDIA_BITCODE_LIB_STRICTNESS", "see_what_works")
 assert STRICTNESS in ("see_what_works", "all_must_work")
 
-
 @pytest.fixture
 def clear_find_bitcode_lib_cache():
     find_bitcode_lib_module.find_bitcode_lib.cache_clear()
@@ -153,4 +152,4 @@ def test_find_bitcode_lib_returns_path(monkeypatch, mocker, tmp_path):
 
 def test_find_bitcode_lib_invalid_name():
     with pytest.raises(ValueError, match="Unknown bitcode library"):
-        find_bitcode_lib_module.locate_bitcode_lib("invalid")
+        find_bitcode_lib_module.locate_bitcode_lib("invalid")
\ No newline at end of file
diff --git a/cuda_pathfinder/tests/test_find_nvidia_binaries.py b/cuda_pathfinder/tests/test_find_nvidia_binaries.py
index c017ecb7d1..4a775a82ba 100644
--- a/cuda_pathfinder/tests/test_find_nvidia_binaries.py
+++ b/cuda_pathfinder/tests/test_find_nvidia_binaries.py
@@ -17,7 +17,6 @@
 STRICTNESS = os.environ.get("CUDA_PATHFINDER_TEST_FIND_NVIDIA_HEADERS_STRICTNESS", "see_what_works")
 assert STRICTNESS in ("see_what_works", "all_must_work")
 
-
 def test_unknown_utility_name():
     with pytest.raises(UnsupportedBinaryError, match=r"'unknown-utility' is not supported"):
         find_nvidia_binary_utility("unknown-utility")

From 2748ce03cc9d9af04e267b2d4f88f916bf6337cb Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Mon, 16 Feb 2026 19:18:35 +0000
Subject: [PATCH 62/79] refresh

---
 .github/workflows/test-wheel-linux.yml   | 2 ++
 .github/workflows/test-wheel-windows.yml | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml
index 0666893746..3d680ef6a4 100644
--- a/.github/workflows/test-wheel-linux.yml
+++ b/.github/workflows/test-wheel-linux.yml
@@ -251,6 +251,7 @@ jobs:
         env:
           CUDA_PATHFINDER_TEST_LOAD_NVIDIA_DYNAMIC_LIB_STRICTNESS: see_what_works
           CUDA_PATHFINDER_TEST_FIND_NVIDIA_HEADERS_STRICTNESS: see_what_works
+          CUDA_PATHFINDER_TEST_FIND_NVIDIA_BITCODE_LIB_STRICTNESS: see_what_works
         run: run-tests pathfinder
 
       - name: Run cuda.bindings tests
@@ -296,4 +297,5 @@ jobs:
         env:
           CUDA_PATHFINDER_TEST_LOAD_NVIDIA_DYNAMIC_LIB_STRICTNESS: all_must_work
           CUDA_PATHFINDER_TEST_FIND_NVIDIA_HEADERS_STRICTNESS: all_must_work
+          CUDA_PATHFINDER_TEST_FIND_NVIDIA_BITCODE_LIB_STRICTNESS: all_must_work
         run: run-tests pathfinder
diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml
index 2b6ddd8eea..b996aa3ff8 100644
--- a/.github/workflows/test-wheel-windows.yml
+++ b/.github/workflows/test-wheel-windows.yml
@@ -224,6 +224,7 @@ jobs:
         env:
           CUDA_PATHFINDER_TEST_LOAD_NVIDIA_DYNAMIC_LIB_STRICTNESS: see_what_works
           CUDA_PATHFINDER_TEST_FIND_NVIDIA_HEADERS_STRICTNESS: see_what_works
+          CUDA_PATHFINDER_TEST_FIND_NVIDIA_BITCODE_LIB_STRICTNESS: see_what_works
         shell: bash --noprofile --norc -xeuo pipefail {0}
         run: run-tests pathfinder
 
@@ -273,5 +274,6 @@ jobs:
         env:
           CUDA_PATHFINDER_TEST_LOAD_NVIDIA_DYNAMIC_LIB_STRICTNESS: all_must_work
           CUDA_PATHFINDER_TEST_FIND_NVIDIA_HEADERS_STRICTNESS: all_must_work
+          CUDA_PATHFINDER_TEST_FIND_NVIDIA_BITCODE_LIB_STRICTNESS: all_must_work
         shell: bash --noprofile --norc -xeuo pipefail {0}
         run: run-tests pathfinder

From 8f44791fac265004f463ff4b080e2d21713933ca Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 16 Feb 2026 19:19:30 +0000
Subject: [PATCH 63/79] [pre-commit.ci] auto code formatting

---
 cuda_pathfinder/tests/test_find_bitcode_lib.py     | 3 ++-
 cuda_pathfinder/tests/test_find_nvidia_binaries.py | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/cuda_pathfinder/tests/test_find_bitcode_lib.py b/cuda_pathfinder/tests/test_find_bitcode_lib.py
index 170fbc6279..8d1a7f0333 100644
--- a/cuda_pathfinder/tests/test_find_bitcode_lib.py
+++ b/cuda_pathfinder/tests/test_find_bitcode_lib.py
@@ -20,6 +20,7 @@
 STRICTNESS = os.environ.get("CUDA_PATHFINDER_TEST_FIND_NVIDIA_BITCODE_LIB_STRICTNESS", "see_what_works")
 assert STRICTNESS in ("see_what_works", "all_must_work")
 
+
 @pytest.fixture
 def clear_find_bitcode_lib_cache():
     find_bitcode_lib_module.find_bitcode_lib.cache_clear()
@@ -152,4 +153,4 @@ def test_find_bitcode_lib_returns_path(monkeypatch, mocker, tmp_path):
 
 def test_find_bitcode_lib_invalid_name():
     with pytest.raises(ValueError, match="Unknown bitcode library"):
-        find_bitcode_lib_module.locate_bitcode_lib("invalid")
\ No newline at end of file
+        find_bitcode_lib_module.locate_bitcode_lib("invalid")
diff --git a/cuda_pathfinder/tests/test_find_nvidia_binaries.py b/cuda_pathfinder/tests/test_find_nvidia_binaries.py
index 4a775a82ba..c017ecb7d1 100644
--- a/cuda_pathfinder/tests/test_find_nvidia_binaries.py
+++ b/cuda_pathfinder/tests/test_find_nvidia_binaries.py
@@ -17,6 +17,7 @@
 STRICTNESS = os.environ.get("CUDA_PATHFINDER_TEST_FIND_NVIDIA_HEADERS_STRICTNESS", "see_what_works")
 assert STRICTNESS in ("see_what_works", "all_must_work")
 
+
 def test_unknown_utility_name():
     with pytest.raises(UnsupportedBinaryError, match=r"'unknown-utility' is not supported"):
         find_nvidia_binary_utility("unknown-utility")

From 6b1fc9faf1debc974d75df6c94713bf4153bfd84 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Mon, 16 Feb 2026 19:20:57 +0000
Subject: [PATCH 64/79] refresh

---
 .../pathfinder/_static_libs/find_bitcode_lib.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py
index 791be76345..79969178be 100644
--- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py
+++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py
@@ -24,13 +24,13 @@ class LocatedBitcodeLib:
     filename: str
 
 
-class _BitcodeLibConfig(TypedDict):
+class _BitcodeLibInfo(TypedDict):  # Renamed: Config -> Info
     filename: str
     rel_path: str
     site_packages_dirs: tuple[str, ...]
 
 
-SUPPORTED_BITCODE_LIBS: dict[str, _BitcodeLibConfig] = {
+_SUPPORTED_BITCODE_LIBS_INFO: dict[str, _BitcodeLibInfo] = {  # Renamed: added underscore prefix
     "device": {
         "filename": "libdevice.10.bc",
         "rel_path": os.path.join("nvvm", "libdevice"),
@@ -41,6 +41,9 @@ class _BitcodeLibConfig(TypedDict):
     },
 }
 
+# Public API: just the supported library names
+SUPPORTED_BITCODE_LIBS: tuple[str, ...] = tuple(sorted(_SUPPORTED_BITCODE_LIBS_INFO.keys()))
+
 
 def _no_such_file_in_dir(dir_path: str, filename: str, error_messages: list[str], attachments: list[str]) -> None:
     error_messages.append(f"No such file: {os.path.join(dir_path, filename)}")
@@ -54,12 +57,12 @@ def _no_such_file_in_dir(dir_path: str, filename: str, error_messages: list[str]
 
 class _FindBitcodeLib:
     def __init__(self, name: str) -> None:
-        if name not in SUPPORTED_BITCODE_LIBS:
+        if name not in _SUPPORTED_BITCODE_LIBS_INFO:  # Updated reference
             raise ValueError(
-                f"Unknown bitcode library: '{name}'. Supported: {', '.join(sorted(SUPPORTED_BITCODE_LIBS.keys()))}"
+                f"Unknown bitcode library: '{name}'. Supported: {', '.join(SUPPORTED_BITCODE_LIBS)}"
             )
         self.name: str = name
-        self.config: _BitcodeLibConfig = SUPPORTED_BITCODE_LIBS[name]
+        self.config: _BitcodeLibInfo = _SUPPORTED_BITCODE_LIBS_INFO[name]  # Updated reference
         self.filename: str = self.config["filename"]
         self.rel_path: str = self.config["rel_path"]
         self.site_packages_dirs: tuple[str, ...] = self.config["site_packages_dirs"]
@@ -135,7 +138,7 @@ def find_bitcode_lib(name: str) -> str:
     """Find the absolute path to a bitcode library."""
     result = locate_bitcode_lib(name)
     if result is None:
-        config = SUPPORTED_BITCODE_LIBS.get(name)
-        filename = config["filename"] if config else name
+        info = _SUPPORTED_BITCODE_LIBS_INFO.get(name)  # Updated reference
+        filename = info["filename"] if info else name
         raise BitcodeLibNotFoundError(f"{filename} not found")
     return result.abs_path

From 404963b0f9fde9257be7bc4223520eba601eac4b Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 16 Feb 2026 19:22:06 +0000
Subject: [PATCH 65/79] [pre-commit.ci] auto code formatting

---
 .../cuda/pathfinder/_static_libs/find_bitcode_lib.py          | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py
index 79969178be..f752290f96 100644
--- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py
+++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py
@@ -58,9 +58,7 @@ def _no_such_file_in_dir(dir_path: str, filename: str, error_messages: list[str]
 class _FindBitcodeLib:
     def __init__(self, name: str) -> None:
         if name not in _SUPPORTED_BITCODE_LIBS_INFO:  # Updated reference
-            raise ValueError(
-                f"Unknown bitcode library: '{name}'. Supported: {', '.join(SUPPORTED_BITCODE_LIBS)}"
-            )
+            raise ValueError(f"Unknown bitcode library: '{name}'. Supported: {', '.join(SUPPORTED_BITCODE_LIBS)}")
         self.name: str = name
         self.config: _BitcodeLibInfo = _SUPPORTED_BITCODE_LIBS_INFO[name]  # Updated reference
         self.filename: str = self.config["filename"]

From bc27ed671fcf7a26e85e0cd469725e7452474250 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Tue, 17 Feb 2026 08:16:55 +0000
Subject: [PATCH 66/79] refresh pathfinder test after rebae

---
 cuda_pathfinder/tests/test_find_bitcode_lib.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cuda_pathfinder/tests/test_find_bitcode_lib.py b/cuda_pathfinder/tests/test_find_bitcode_lib.py
index 8d1a7f0333..7babcd5b95 100644
--- a/cuda_pathfinder/tests/test_find_bitcode_lib.py
+++ b/cuda_pathfinder/tests/test_find_bitcode_lib.py
@@ -45,7 +45,7 @@ def _located_bitcode_lib_asserts(located_bitcode_lib):
     assert os.path.isfile(located_bitcode_lib.abs_path)
 
 
-@pytest.mark.parametrize("libname", SUPPORTED_BITCODE_LIBS.keys())
+@pytest.mark.parametrize("libname", SUPPORTED_BITCODE_LIBS)
 def test_locate_bitcode_lib(info_summary_append, libname):
     lib_path = find_bitcode_lib(libname) if locate_bitcode_lib(libname) else None
     located_lib = locate_bitcode_lib(libname)
@@ -55,7 +55,7 @@ def test_locate_bitcode_lib(info_summary_append, libname):
     if lib_path:
         _located_bitcode_lib_asserts(located_lib)
         assert os.path.isfile(lib_path)
-        expected_filename = SUPPORTED_BITCODE_LIBS[libname]["filename"]
+        expected_filename = located_lib.filename
         assert os.path.basename(lib_path) == expected_filename
     if STRICTNESS == "all_must_work":
         assert lib_path is not None

From 82c65a10e7bd8ab2d26841333ba7d064ff981a96 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 17 Feb 2026 11:13:11 -0800
Subject: [PATCH 67/79] Uniform handling of cuda_python_test_helpers dependency
 in cuda_bindings, cuda_core

---
 cuda_bindings/tests/conftest.py     | 10 ++++++++++
 cuda_core/tests/conftest.py         |  9 +++++++++
 cuda_core/tests/helpers/__init__.py |  9 +--------
 3 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/cuda_bindings/tests/conftest.py b/cuda_bindings/tests/conftest.py
index f0a426406a..d651673cbc 100644
--- a/cuda_bindings/tests/conftest.py
+++ b/cuda_bindings/tests/conftest.py
@@ -1,9 +1,19 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+import pathlib
+import sys
+
 import cuda.bindings.driver as cuda
 import pytest
 
+# Import shared test helpers for tests across subprojects.
+_test_helpers_root = pathlib.Path(__file__).resolve().parents[2] / "cuda_python_test_helpers"
+if _test_helpers_root.is_dir():
+    test_helpers_root = str(_test_helpers_root)
+    if test_helpers_root not in sys.path:
+        sys.path.insert(0, test_helpers_root)
+
 
 @pytest.fixture(scope="module")
 def cuda_driver():
diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py
index 340e632719..95ec3af8e2 100644
--- a/cuda_core/tests/conftest.py
+++ b/cuda_core/tests/conftest.py
@@ -3,6 +3,15 @@
 
 import multiprocessing
 import os
+import pathlib
+import sys
+
+# Import shared test helpers for tests across subprojects.
+_test_helpers_root = pathlib.Path(__file__).resolve().parents[2] / "cuda_python_test_helpers"
+if _test_helpers_root.is_dir():
+    test_helpers_root = str(_test_helpers_root)
+    if test_helpers_root not in sys.path:
+        sys.path.insert(0, test_helpers_root)
 
 import helpers
 import pytest
diff --git a/cuda_core/tests/helpers/__init__.py b/cuda_core/tests/helpers/__init__.py
index ad9d281c16..f83a5b9a11 100644
--- a/cuda_core/tests/helpers/__init__.py
+++ b/cuda_core/tests/helpers/__init__.py
@@ -3,8 +3,6 @@
 
 import functools
 import os
-import pathlib
-import sys
 from typing import Union
 
 from cuda.core._utils.cuda_utils import handle_return
@@ -22,12 +20,7 @@
             CCCL_INCLUDE_PATHS = (path,) + CCCL_INCLUDE_PATHS
 
 
-try:
-    from cuda_python_test_helpers import *  # noqa: F403
-except ImportError:
-    # Import shared platform helpers for tests across repos
-    sys.path.insert(0, str(pathlib.Path(__file__).resolve().parents[3] / "cuda_python_test_helpers"))
-    from cuda_python_test_helpers import *  # noqa: F403
+from cuda_python_test_helpers import *  # noqa: F403
 
 
 @functools.cache

From 9bc68bb2cdf9841ca8f721ed5815e16d7cc335ea Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 17 Feb 2026 11:20:26 -0800
Subject: [PATCH 68/79] pre-commit cleanup

---
 cuda_core/tests/conftest.py         | 24 ++++++++++++++----------
 cuda_core/tests/helpers/__init__.py |  5 ++---
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py
index 95ec3af8e2..b8bf28b6d4 100644
--- a/cuda_core/tests/conftest.py
+++ b/cuda_core/tests/conftest.py
@@ -6,14 +6,6 @@
 import pathlib
 import sys
 
-# Import shared test helpers for tests across subprojects.
-_test_helpers_root = pathlib.Path(__file__).resolve().parents[2] / "cuda_python_test_helpers"
-if _test_helpers_root.is_dir():
-    test_helpers_root = str(_test_helpers_root)
-    if test_helpers_root not in sys.path:
-        sys.path.insert(0, test_helpers_root)
-
-import helpers
 import pytest
 
 try:
@@ -34,6 +26,13 @@
 )
 from cuda.core._utils.cuda_utils import handle_return
 
+# Import shared test helpers for tests across subprojects.
+_test_helpers_root = pathlib.Path(__file__).resolve().parents[2] / "cuda_python_test_helpers"
+if _test_helpers_root.is_dir():
+    test_helpers_root = str(_test_helpers_root)
+    if test_helpers_root not in sys.path:
+        sys.path.insert(0, test_helpers_root)
+
 
 def skip_if_pinned_memory_unsupported(device):
     try:
@@ -139,7 +138,9 @@ def ipc_device():
         pytest.skip("Device does not support IPC")
 
     # Skip on WSL or if driver rejects IPC-enabled mempool creation on this platform/device
-    if helpers.IS_WSL or not helpers.supports_ipc_mempool(device):
+    from helpers import IS_WSL, supports_ipc_mempool
+
+    if IS_WSL or not supports_ipc_mempool(device):
         pytest.skip("Driver rejects IPC-enabled mempool creation on this platform")
 
     return device
@@ -236,4 +237,7 @@ def test_something(memory_resource_factory):
     return request.param
 
 
-skipif_need_cuda_headers = pytest.mark.skipif(helpers.CUDA_INCLUDE_PATH is None, reason="need CUDA header")
+skipif_need_cuda_headers = pytest.mark.skipif(
+    not os.path.isdir(os.path.join(os.environ.get("CUDA_PATH", ""), "include")),
+    reason="need CUDA header",
+)
diff --git a/cuda_core/tests/helpers/__init__.py b/cuda_core/tests/helpers/__init__.py
index f83a5b9a11..ea9146ca15 100644
--- a/cuda_core/tests/helpers/__init__.py
+++ b/cuda_core/tests/helpers/__init__.py
@@ -7,6 +7,8 @@
 
 from cuda.core._utils.cuda_utils import handle_return
 
+from cuda_python_test_helpers import *  # noqa: F403
+
 CUDA_PATH = os.environ.get("CUDA_PATH")
 CUDA_INCLUDE_PATH = None
 CCCL_INCLUDE_PATHS = None
@@ -20,9 +22,6 @@
             CCCL_INCLUDE_PATHS = (path,) + CCCL_INCLUDE_PATHS
 
 
-from cuda_python_test_helpers import *  # noqa: F403
-
-
 @functools.cache
 def supports_ipc_mempool(device_id: Union[int, object]) -> bool:
     """Return True if mempool IPC via POSIX file descriptor is supported.

From 38e87d2000c9e4132c3c5b7449dd25ed196340d5 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 17 Feb 2026 11:34:16 -0800
Subject: [PATCH 69/79] Prefer installed cuda-python-test-helpers in test
 bootstrap.

Avoid silently overriding site-packages with checkout paths during test setup by only injecting the repo helper path when the helper distribution is not installed, and fail fast with a clear error when neither source is available.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 cuda_bindings/tests/conftest.py | 11 ++++++++++-
 cuda_core/tests/conftest.py     | 11 ++++++++++-
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/cuda_bindings/tests/conftest.py b/cuda_bindings/tests/conftest.py
index d651673cbc..052853e0d3 100644
--- a/cuda_bindings/tests/conftest.py
+++ b/cuda_bindings/tests/conftest.py
@@ -3,13 +3,22 @@
 
 import pathlib
 import sys
+from importlib.metadata import PackageNotFoundError, distribution
 
 import cuda.bindings.driver as cuda
 import pytest
 
 # Import shared test helpers for tests across subprojects.
+# PLEASE KEEP IN SYNC with copies in other conftest.py in this repo.
 _test_helpers_root = pathlib.Path(__file__).resolve().parents[2] / "cuda_python_test_helpers"
-if _test_helpers_root.is_dir():
+try:
+    distribution("cuda-python-test-helpers")
+except PackageNotFoundError as exc:
+    if not _test_helpers_root.is_dir():
+        raise RuntimeError(
+            f"cuda-python-test-helpers not installed; expected checkout path {_test_helpers_root}"
+        ) from exc
+
     test_helpers_root = str(_test_helpers_root)
     if test_helpers_root not in sys.path:
         sys.path.insert(0, test_helpers_root)
diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py
index b8bf28b6d4..4e1500b491 100644
--- a/cuda_core/tests/conftest.py
+++ b/cuda_core/tests/conftest.py
@@ -5,6 +5,7 @@
 import os
 import pathlib
 import sys
+from importlib.metadata import PackageNotFoundError, distribution
 
 import pytest
 
@@ -27,8 +28,16 @@
 from cuda.core._utils.cuda_utils import handle_return
 
 # Import shared test helpers for tests across subprojects.
+# PLEASE KEEP IN SYNC with copies in other conftest.py in this repo.
 _test_helpers_root = pathlib.Path(__file__).resolve().parents[2] / "cuda_python_test_helpers"
-if _test_helpers_root.is_dir():
+try:
+    distribution("cuda-python-test-helpers")
+except PackageNotFoundError as exc:
+    if not _test_helpers_root.is_dir():
+        raise RuntimeError(
+            f"cuda-python-test-helpers not installed; expected checkout path {_test_helpers_root}"
+        ) from exc
+
     test_helpers_root = str(_test_helpers_root)
     if test_helpers_root not in sys.path:
         sys.path.insert(0, test_helpers_root)

From 4c2dc462d0717406c1b58d1dc3936bd6c64032c5 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 17 Feb 2026 11:50:47 -0800
Subject: [PATCH 70/79] Remove cuda_core Pixi PYTHONPATH override for test
 helpers.

Keep helper resolution consistent with test bootstrap logic so Pixi runs do not silently force checkout helpers ahead of installed packages.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 cuda_core/pixi.toml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/cuda_core/pixi.toml b/cuda_core/pixi.toml
index a49526d405..b1f5894b05 100644
--- a/cuda_core/pixi.toml
+++ b/cuda_core/pixi.toml
@@ -138,7 +138,6 @@ cmd = [
     "--override-ini",
     "norecursedirs=\"\"", # include cython tests (ignore by default config)
 ]
-env = { PYTHONPATH = "$PIXI_PROJECT_ROOT/../cuda_python_test_helpers" }
 depends-on = [{ task = "build-cython-tests" }]
 
 [target.win-64.tasks.test]
@@ -148,5 +147,4 @@ cmd = [
     "--override-ini",
     "norecursedirs=\"\"",  # include cython tests (ignore by default config)
 ]
-env = { PYTHONPATH = '%PIXI_PROJECT_ROOT%\..\cuda_python_test_helpers' }
 depends-on = [{ task = "build-cython-tests" }]

From c93251ab6b4277ad57a4d4d209667f8dc2a83b6f Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 17 Feb 2026 11:51:05 -0800
Subject: [PATCH 71/79] Remove helper package install from coverage workflow.

Coverage jobs now use the same conftest bootstrap path as other test flows, reducing redundant setup and keeping helper resolution behavior consistent across CI.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .github/workflows/coverage.yml | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml
index 1782263486..4cb355d691 100644
--- a/.github/workflows/coverage.yml
+++ b/.github/workflows/coverage.yml
@@ -98,10 +98,6 @@ jobs:
         run: |
           .venv/bin/pip install -v ./cuda_pathfinder
 
-      - name: Build cuda-python-test-helpers
-        run: |
-          .venv/bin/pip install -v ./cuda_python_test_helpers
-
       - name: Build cuda-bindings
         run: |
           cd cuda_bindings

From 57eb0b0f3522f8a57c523bfc54b4f17ea064937a Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 17 Feb 2026 12:04:36 -0800
Subject: [PATCH 72/79] Use per-module pytest plugin registration for NVVM
 helper fixtures.

Keep fixture registration next to the tests that consume it and drop direct fixture imports that required lint suppressions.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 cuda_bindings/tests/test_nvvm.py | 3 ++-
 cuda_core/tests/test_program.py  | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/cuda_bindings/tests/test_nvvm.py b/cuda_bindings/tests/test_nvvm.py
index 81992616ba..05fec9767d 100644
--- a/cuda_bindings/tests/test_nvvm.py
+++ b/cuda_bindings/tests/test_nvvm.py
@@ -7,7 +7,8 @@
 
 import pytest
 from cuda.bindings import nvvm
-from cuda_python_test_helpers.nvvm_bitcode import minimal_nvvmir  # noqa: F401, F811
+
+pytest_plugins = ("cuda_python_test_helpers.nvvm_bitcode",)
 
 
 @pytest.fixture(params=[nvvm.compile_program, nvvm.verify_program])
diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index 30cdad6b9e..15a2f504a2 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -11,7 +11,8 @@
 from cuda.core._module import Kernel, ObjectCode
 from cuda.core._program import Program, ProgramOptions
 from cuda.core._utils.cuda_utils import CUDAError, driver, handle_return
-from cuda_python_test_helpers.nvvm_bitcode import minimal_nvvmir  # noqa: F401, F811
+
+pytest_plugins = ("cuda_python_test_helpers.nvvm_bitcode",)
 
 cuda_driver_version = handle_return(driver.cuDriverGetVersion())
 is_culink_backend = _linker._decide_nvjitlink_or_driver()

From f0eb1a79a1a24c8d9fdce4aef2087d901e0cd545 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 17 Feb 2026 14:04:28 -0800
Subject: [PATCH 73/79] Remove "Renamed" comments

---
 .../cuda/pathfinder/_static_libs/find_bitcode_lib.py          | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py
index f752290f96..8b9b566592 100644
--- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py
+++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py
@@ -24,13 +24,13 @@ class LocatedBitcodeLib:
     filename: str
 
 
-class _BitcodeLibInfo(TypedDict):  # Renamed: Config -> Info
+class _BitcodeLibInfo(TypedDict):
     filename: str
     rel_path: str
     site_packages_dirs: tuple[str, ...]
 
 
-_SUPPORTED_BITCODE_LIBS_INFO: dict[str, _BitcodeLibInfo] = {  # Renamed: added underscore prefix
+_SUPPORTED_BITCODE_LIBS_INFO: dict[str, _BitcodeLibInfo] = {
     "device": {
         "filename": "libdevice.10.bc",
         "rel_path": os.path.join("nvvm", "libdevice"),

From 7c6e2c9fbcabecf3484576384485e662f774199c Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 17 Feb 2026 15:05:40 -0800
Subject: [PATCH 74/79] Make bitcode locate/find consistently raise not-found
 errors.

Raise BitcodeLibNotFoundError from locate_bitcode_lib instead of returning None, simplify find_bitcode_lib to rely on that behavior, and update bitcode tests to handle expected not-found cases outside all_must_work mode.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../_static_libs/find_bitcode_lib.py          | 29 +++++++++++--------
 .../tests/test_find_bitcode_lib.py            | 24 ++++++++-------
 2 files changed, 31 insertions(+), 22 deletions(-)

diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py
index 8b9b566592..a12b6c3245 100644
--- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py
+++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py
@@ -4,7 +4,7 @@
 import functools
 import os
 from dataclasses import dataclass
-from typing import TypedDict
+from typing import NoReturn, TypedDict
 
 from cuda.pathfinder._dynamic_libs.supported_nvidia_libs import IS_WINDOWS
 from cuda.pathfinder._utils.env_vars import get_cuda_home_or_path
@@ -105,14 +105,19 @@ def try_with_cuda_home(self) -> str | None:
         )
         return None
 
-    def raise_not_found_error(self) -> None:
+    def raise_not_found_error(self) -> NoReturn:
         err = ", ".join(self.error_messages) if self.error_messages else "No search paths available"
         att = "\n".join(self.attachments) if self.attachments else ""
         raise BitcodeLibNotFoundError(f'Failure finding "{self.filename}": {err}\n{att}')
 
 
-def locate_bitcode_lib(name: str) -> LocatedBitcodeLib | None:
-    """Locate a bitcode library by name."""
+def locate_bitcode_lib(name: str) -> LocatedBitcodeLib:
+    """Locate a bitcode library by name.
+
+    Raises:
+        ValueError: If ``name`` is not a supported bitcode library.
+        BitcodeLibNotFoundError: If the bitcode library cannot be found.
+    """
     finder = _FindBitcodeLib(name)
 
     abs_path = finder.try_site_packages()
@@ -122,7 +127,7 @@ def locate_bitcode_lib(name: str) -> LocatedBitcodeLib | None:
         abs_path = finder.try_with_cuda_home()
 
     if abs_path is None:
-        return None
+        finder.raise_not_found_error()
 
     return LocatedBitcodeLib(
         name=name,
@@ -133,10 +138,10 @@ def locate_bitcode_lib(name: str) -> LocatedBitcodeLib | None:
 
 @functools.cache
 def find_bitcode_lib(name: str) -> str:
-    """Find the absolute path to a bitcode library."""
-    result = locate_bitcode_lib(name)
-    if result is None:
-        info = _SUPPORTED_BITCODE_LIBS_INFO.get(name)  # Updated reference
-        filename = info["filename"] if info else name
-        raise BitcodeLibNotFoundError(f"{filename} not found")
-    return result.abs_path
+    """Find the absolute path to a bitcode library.
+
+    Raises:
+        ValueError: If ``name`` is not a supported bitcode library.
+        BitcodeLibNotFoundError: If the bitcode library cannot be found.
+    """
+    return locate_bitcode_lib(name).abs_path
diff --git a/cuda_pathfinder/tests/test_find_bitcode_lib.py b/cuda_pathfinder/tests/test_find_bitcode_lib.py
index 7babcd5b95..206bb56737 100644
--- a/cuda_pathfinder/tests/test_find_bitcode_lib.py
+++ b/cuda_pathfinder/tests/test_find_bitcode_lib.py
@@ -8,6 +8,7 @@
 import cuda.pathfinder._static_libs.find_bitcode_lib as find_bitcode_lib_module
 from cuda.pathfinder._static_libs.find_bitcode_lib import (
     SUPPORTED_BITCODE_LIBS,
+    BitcodeLibNotFoundError,
     find_bitcode_lib,
     locate_bitcode_lib,
 )
@@ -47,18 +48,21 @@ def _located_bitcode_lib_asserts(located_bitcode_lib):
 
 @pytest.mark.parametrize("libname", SUPPORTED_BITCODE_LIBS)
 def test_locate_bitcode_lib(info_summary_append, libname):
-    lib_path = find_bitcode_lib(libname) if locate_bitcode_lib(libname) else None
-    located_lib = locate_bitcode_lib(libname)
-    assert lib_path is None if not located_lib else lib_path == located_lib.abs_path
+    try:
+        located_lib = locate_bitcode_lib(libname)
+        lib_path = find_bitcode_lib(libname)
+    except BitcodeLibNotFoundError:
+        lib_path = None
+        if STRICTNESS == "all_must_work":
+            raise
+        return
 
     info_summary_append(f"{lib_path=!r}")
-    if lib_path:
-        _located_bitcode_lib_asserts(located_lib)
-        assert os.path.isfile(lib_path)
-        expected_filename = located_lib.filename
-        assert os.path.basename(lib_path) == expected_filename
-    if STRICTNESS == "all_must_work":
-        assert lib_path is not None
+    _located_bitcode_lib_asserts(located_lib)
+    assert os.path.isfile(lib_path)
+    assert lib_path == located_lib.abs_path
+    expected_filename = located_lib.filename
+    assert os.path.basename(lib_path) == expected_filename
 
 
 @pytest.mark.parametrize("rel_dir", [SITE_PACKAGES_REL_DIR_CUDA12, SITE_PACKAGES_REL_DIR_CUDA13])

From b77f91bb909311e518496f65f890a5e99b97a291 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 17 Feb 2026 15:42:49 -0800
Subject: [PATCH 75/79] Refocus bitcode-lib tests on real coverage and narrow
 mocks.

Keep the environment-backed locate/find smoke test as the primary happy-path check, and replace broad mock scenarios with targeted branch tests for search order and not-found diagnostics.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../tests/test_find_bitcode_lib.py            | 132 ++++++++----------
 1 file changed, 62 insertions(+), 70 deletions(-)

diff --git a/cuda_pathfinder/tests/test_find_bitcode_lib.py b/cuda_pathfinder/tests/test_find_bitcode_lib.py
index 206bb56737..3ac8ac3ae0 100644
--- a/cuda_pathfinder/tests/test_find_bitcode_lib.py
+++ b/cuda_pathfinder/tests/test_find_bitcode_lib.py
@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import os
+from pathlib import Path
 
 import pytest
 
@@ -13,14 +14,11 @@
     locate_bitcode_lib,
 )
 
-FILENAME = "libdevice.10.bc"
-
-SITE_PACKAGES_REL_DIR_CUDA12 = "nvidia/cuda_nvcc/nvvm/libdevice"
-SITE_PACKAGES_REL_DIR_CUDA13 = "nvidia/cuda_nvvm/nvvm/libdevice"
-
 STRICTNESS = os.environ.get("CUDA_PATHFINDER_TEST_FIND_NVIDIA_BITCODE_LIB_STRICTNESS", "see_what_works")
 assert STRICTNESS in ("see_what_works", "all_must_work")
 
+BCL_FILENAME = find_bitcode_lib_module._SUPPORTED_BITCODE_LIBS_INFO["device"]["filename"]
+
 
 @pytest.fixture
 def clear_find_bitcode_lib_cache():
@@ -29,12 +27,21 @@ def clear_find_bitcode_lib_cache():
     find_bitcode_lib_module.find_bitcode_lib.cache_clear()
 
 
-def _make_bitcode_lib_file(dir_path: str) -> str:
-    os.makedirs(dir_path, exist_ok=True)
-    file_path = os.path.join(dir_path, FILENAME)
-    with open(file_path, "wb"):
-        pass
-    return file_path
+def _make_bitcode_lib_file(dir_path: Path) -> str:
+    dir_path.mkdir(parents=True, exist_ok=True)
+    file_path = dir_path / BCL_FILENAME
+    file_path.touch()
+    return str(file_path)
+
+
+def _bitcode_lib_dir_under(anchor_dir: Path) -> Path:
+    return anchor_dir / "nvvm" / "libdevice"
+
+
+def _conda_anchor(conda_prefix: Path) -> Path:
+    if find_bitcode_lib_module.IS_WINDOWS:
+        return conda_prefix / "Library"
+    return conda_prefix
 
 
 def _located_bitcode_lib_asserts(located_bitcode_lib):
@@ -46,15 +53,16 @@ def _located_bitcode_lib_asserts(located_bitcode_lib):
     assert os.path.isfile(located_bitcode_lib.abs_path)
 
 
+@pytest.mark.usefixtures("clear_find_bitcode_lib_cache")
 @pytest.mark.parametrize("libname", SUPPORTED_BITCODE_LIBS)
 def test_locate_bitcode_lib(info_summary_append, libname):
     try:
         located_lib = locate_bitcode_lib(libname)
         lib_path = find_bitcode_lib(libname)
     except BitcodeLibNotFoundError:
-        lib_path = None
         if STRICTNESS == "all_must_work":
             raise
+        info_summary_append(f"{libname}: not found")
         return
 
     info_summary_append(f"{lib_path=!r}")
@@ -65,94 +73,78 @@ def test_locate_bitcode_lib(info_summary_append, libname):
     assert os.path.basename(lib_path) == expected_filename
 
 
-@pytest.mark.parametrize("rel_dir", [SITE_PACKAGES_REL_DIR_CUDA12, SITE_PACKAGES_REL_DIR_CUDA13])
 @pytest.mark.usefixtures("clear_find_bitcode_lib_cache")
-def test_find_bitcode_lib_via_site_packages(monkeypatch, mocker, tmp_path, rel_dir):
-    bitcode_lib_dir = tmp_path.joinpath(*rel_dir.split("/"))
-    expected_path = str(_make_bitcode_lib_file(str(bitcode_lib_dir)))
+def test_locate_bitcode_lib_search_order(monkeypatch, tmp_path):
+    site_packages_lib_dir = tmp_path / "site-packages" / "nvidia" / "cu13" / "nvvm" / "libdevice"
+    site_packages_path = _make_bitcode_lib_file(site_packages_lib_dir)
 
-    mocker.patch.object(
-        find_bitcode_lib_module,
-        "find_sub_dirs_all_sitepackages",
-        return_value=[str(bitcode_lib_dir)],
-    )
-    monkeypatch.delenv("CONDA_PREFIX", raising=False)
-    monkeypatch.delenv("CUDA_HOME", raising=False)
-    monkeypatch.delenv("CUDA_PATH", raising=False)
-
-    result = find_bitcode_lib_module.locate_bitcode_lib("device")
+    conda_prefix = tmp_path / "conda-prefix"
+    conda_path = _make_bitcode_lib_file(_bitcode_lib_dir_under(_conda_anchor(conda_prefix)))
 
-    assert result is not None
-    assert result.abs_path == expected_path
-    assert result.name == "device"
-    assert result.filename == FILENAME
-    assert os.path.isfile(result.abs_path)
-
-
-@pytest.mark.usefixtures("clear_find_bitcode_lib_cache")
-def test_find_bitcode_lib_via_conda(monkeypatch, mocker, tmp_path):
-    rel_path = os.path.join("nvvm", "libdevice")
-    bitcode_lib_dir = tmp_path / rel_path
-    expected_path = str(_make_bitcode_lib_file(str(bitcode_lib_dir)))
+    cuda_home = tmp_path / "cuda-home"
+    cuda_home_path = _make_bitcode_lib_file(_bitcode_lib_dir_under(cuda_home))
 
-    mocker.patch.object(find_bitcode_lib_module, "IS_WINDOWS", False)
-    mocker.patch.object(
+    monkeypatch.setattr(
         find_bitcode_lib_module,
         "find_sub_dirs_all_sitepackages",
-        return_value=[],
+        lambda _sub_dir: [str(site_packages_lib_dir)],
     )
-    monkeypatch.setenv("CONDA_PREFIX", str(tmp_path))
-    monkeypatch.delenv("CUDA_HOME", raising=False)
+    monkeypatch.setenv("CONDA_PREFIX", str(conda_prefix))
+    monkeypatch.setenv("CUDA_HOME", str(cuda_home))
     monkeypatch.delenv("CUDA_PATH", raising=False)
 
-    result = find_bitcode_lib_module.locate_bitcode_lib("device")
+    assert locate_bitcode_lib("device").abs_path == site_packages_path
+    os.remove(site_packages_path)
 
-    assert result is not None
-    assert result.abs_path == expected_path
-    assert os.path.isfile(result.abs_path)
+    assert locate_bitcode_lib("device").abs_path == conda_path
+    os.remove(conda_path)
 
+    assert locate_bitcode_lib("device").abs_path == cuda_home_path
 
-@pytest.mark.usefixtures("clear_find_bitcode_lib_cache")
-def test_find_bitcode_lib_via_cuda_home(monkeypatch, mocker, tmp_path):
-    rel_path = os.path.join("nvvm", "libdevice")
-    bitcode_lib_dir = tmp_path / rel_path
-    expected_path = str(_make_bitcode_lib_file(str(bitcode_lib_dir)))
 
-    mocker.patch.object(
+@pytest.mark.usefixtures("clear_find_bitcode_lib_cache")
+def test_find_bitcode_lib_not_found_error_includes_cuda_home_directory_listing(monkeypatch, tmp_path):
+    cuda_home = tmp_path / "cuda-home"
+    lib_dir = _bitcode_lib_dir_under(cuda_home)
+    lib_dir.mkdir(parents=True, exist_ok=True)
+    extra_file = lib_dir / "README.txt"
+    extra_file.write_text("placeholder", encoding="utf-8")
+
+    monkeypatch.setattr(
         find_bitcode_lib_module,
         "find_sub_dirs_all_sitepackages",
-        return_value=[],
+        lambda _sub_dir: [],
     )
     monkeypatch.delenv("CONDA_PREFIX", raising=False)
-    monkeypatch.setenv("CUDA_HOME", str(tmp_path))
+    monkeypatch.setenv("CUDA_HOME", str(cuda_home))
     monkeypatch.delenv("CUDA_PATH", raising=False)
 
-    result = find_bitcode_lib_module.locate_bitcode_lib("device")
+    with pytest.raises(BitcodeLibNotFoundError, match=r'Failure finding "libdevice\.10\.bc"') as exc_info:
+        find_bitcode_lib("device")
 
-    assert result is not None
-    assert result.abs_path == expected_path
-    assert os.path.isfile(result.abs_path)
+    message = str(exc_info.value)
+    expected_missing_file = os.path.join(str(lib_dir), BCL_FILENAME)
+    assert f"No such file: {expected_missing_file}" in message
+    assert f'listdir("{lib_dir}"):' in message
+    assert "README.txt" in message
 
 
 @pytest.mark.usefixtures("clear_find_bitcode_lib_cache")
-def test_find_bitcode_lib_returns_path(monkeypatch, mocker, tmp_path):
-    rel_path = os.path.join("nvvm", "libdevice")
-    bitcode_lib_dir = tmp_path / rel_path
-    expected_path = str(_make_bitcode_lib_file(str(bitcode_lib_dir)))
-
-    mocker.patch.object(
+def test_find_bitcode_lib_not_found_error_without_cuda_home(monkeypatch):
+    monkeypatch.setattr(
         find_bitcode_lib_module,
         "find_sub_dirs_all_sitepackages",
-        return_value=[str(bitcode_lib_dir)],
+        lambda _sub_dir: [],
     )
     monkeypatch.delenv("CONDA_PREFIX", raising=False)
     monkeypatch.delenv("CUDA_HOME", raising=False)
     monkeypatch.delenv("CUDA_PATH", raising=False)
 
-    result = find_bitcode_lib_module.find_bitcode_lib("device")
-
-    assert result == expected_path
-    assert isinstance(result, str)
+    with pytest.raises(
+        BitcodeLibNotFoundError,
+        match=r'Failure finding "libdevice\.10\.bc": CUDA_HOME/CUDA_PATH not set',
+    ):
+        find_bitcode_lib("device")
 
 
 def test_find_bitcode_lib_invalid_name():

From e98f32c2bc055b39901de596cce7f9ec9a3a91c4 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 17 Feb 2026 15:48:13 -0800
Subject: [PATCH 76/79] Undo accidental change in
 cuda_pathfinder/tests/test_find_nvidia_binaries.py

---
 cuda_pathfinder/tests/test_find_nvidia_binaries.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/cuda_pathfinder/tests/test_find_nvidia_binaries.py b/cuda_pathfinder/tests/test_find_nvidia_binaries.py
index c017ecb7d1..4f9eef223a 100644
--- a/cuda_pathfinder/tests/test_find_nvidia_binaries.py
+++ b/cuda_pathfinder/tests/test_find_nvidia_binaries.py
@@ -14,9 +14,6 @@
     SUPPORTED_BINARIES_ALL,
 )
 
-STRICTNESS = os.environ.get("CUDA_PATHFINDER_TEST_FIND_NVIDIA_HEADERS_STRICTNESS", "see_what_works")
-assert STRICTNESS in ("see_what_works", "all_must_work")
-
 
 def test_unknown_utility_name():
     with pytest.raises(UnsupportedBinaryError, match=r"'unknown-utility' is not supported"):

From 296aaf22bd654ceaa96f8a79066b521e5999775b Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 17 Feb 2026 15:52:07 -0800
Subject: [PATCH 77/79] Use direct platform_aware.IS_WINDOWS import (rather
 than the detour through supported_nvidia_libs)

---
 .../cuda/pathfinder/_static_libs/find_bitcode_lib.py            | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py
index a12b6c3245..70b19a899a 100644
--- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py
+++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py
@@ -6,9 +6,9 @@
 from dataclasses import dataclass
 from typing import NoReturn, TypedDict
 
-from cuda.pathfinder._dynamic_libs.supported_nvidia_libs import IS_WINDOWS
 from cuda.pathfinder._utils.env_vars import get_cuda_home_or_path
 from cuda.pathfinder._utils.find_sub_dirs import find_sub_dirs_all_sitepackages
+from cuda.pathfinder._utils.platform_aware import IS_WINDOWS
 
 
 class BitcodeLibNotFoundError(RuntimeError):

From 037b26a90a18d221bd8f755f192ccab610ab5021 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 17 Feb 2026 16:04:08 -0800
Subject: [PATCH 78/79] Add found_via metadata for bitcode library discovery.

Record whether locate_bitcode_lib resolved via site-packages, conda, or CUDA_HOME to mirror pathfinder discovery APIs, and update focused tests to validate the reported source.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../_static_libs/find_bitcode_lib.py          | 41 +++++++++++++------
 .../tests/test_find_bitcode_lib.py            | 14 +++++--
 2 files changed, 39 insertions(+), 16 deletions(-)

diff --git a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py
index 70b19a899a..109f9eb53f 100644
--- a/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py
+++ b/cuda_pathfinder/cuda/pathfinder/_static_libs/find_bitcode_lib.py
@@ -22,6 +22,7 @@ class LocatedBitcodeLib:
     name: str
     abs_path: str
     filename: str
+    found_via: str
 
 
 class _BitcodeLibInfo(TypedDict):
@@ -121,19 +122,33 @@ def locate_bitcode_lib(name: str) -> LocatedBitcodeLib:
     finder = _FindBitcodeLib(name)
 
     abs_path = finder.try_site_packages()
-    if abs_path is None:
-        abs_path = finder.try_with_conda_prefix()
-    if abs_path is None:
-        abs_path = finder.try_with_cuda_home()
-
-    if abs_path is None:
-        finder.raise_not_found_error()
-
-    return LocatedBitcodeLib(
-        name=name,
-        abs_path=abs_path,
-        filename=finder.filename,
-    )
+    if abs_path is not None:
+        return LocatedBitcodeLib(
+            name=name,
+            abs_path=abs_path,
+            filename=finder.filename,
+            found_via="site-packages",
+        )
+
+    abs_path = finder.try_with_conda_prefix()
+    if abs_path is not None:
+        return LocatedBitcodeLib(
+            name=name,
+            abs_path=abs_path,
+            filename=finder.filename,
+            found_via="conda",
+        )
+
+    abs_path = finder.try_with_cuda_home()
+    if abs_path is not None:
+        return LocatedBitcodeLib(
+            name=name,
+            abs_path=abs_path,
+            filename=finder.filename,
+            found_via="CUDA_HOME",
+        )
+
+    finder.raise_not_found_error()
 
 
 @functools.cache
diff --git a/cuda_pathfinder/tests/test_find_bitcode_lib.py b/cuda_pathfinder/tests/test_find_bitcode_lib.py
index 3ac8ac3ae0..2da138c31d 100644
--- a/cuda_pathfinder/tests/test_find_bitcode_lib.py
+++ b/cuda_pathfinder/tests/test_find_bitcode_lib.py
@@ -50,6 +50,8 @@ def _located_bitcode_lib_asserts(located_bitcode_lib):
     assert isinstance(located_bitcode_lib.name, str)
     assert isinstance(located_bitcode_lib.abs_path, str)
     assert isinstance(located_bitcode_lib.filename, str)
+    assert isinstance(located_bitcode_lib.found_via, str)
+    assert located_bitcode_lib.found_via in ("site-packages", "conda", "CUDA_HOME")
     assert os.path.isfile(located_bitcode_lib.abs_path)
 
 
@@ -93,13 +95,19 @@ def test_locate_bitcode_lib_search_order(monkeypatch, tmp_path):
     monkeypatch.setenv("CUDA_HOME", str(cuda_home))
     monkeypatch.delenv("CUDA_PATH", raising=False)
 
-    assert locate_bitcode_lib("device").abs_path == site_packages_path
+    located_lib = locate_bitcode_lib("device")
+    assert located_lib.abs_path == site_packages_path
+    assert located_lib.found_via == "site-packages"
     os.remove(site_packages_path)
 
-    assert locate_bitcode_lib("device").abs_path == conda_path
+    located_lib = locate_bitcode_lib("device")
+    assert located_lib.abs_path == conda_path
+    assert located_lib.found_via == "conda"
     os.remove(conda_path)
 
-    assert locate_bitcode_lib("device").abs_path == cuda_home_path
+    located_lib = locate_bitcode_lib("device")
+    assert located_lib.abs_path == cuda_home_path
+    assert located_lib.found_via == "CUDA_HOME"
 
 
 @pytest.mark.usefixtures("clear_find_bitcode_lib_cache")

From 3f73de4dfab4d3bd90819eac6cdeeff0311ec475 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 17 Feb 2026 16:12:20 -0800
Subject: [PATCH 79/79] Export SUPPORTED_BITCODE_LIBS in cuda.pathfinder public
 API.

Expose supported bitcode names through __init__ using the existing Sphinx indirection pattern so users can discover the API surface consistently with other supported-* constants.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 cuda_pathfinder/cuda/pathfinder/__init__.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/cuda_pathfinder/cuda/pathfinder/__init__.py b/cuda_pathfinder/cuda/pathfinder/__init__.py
index 29a3ff4979..a0ba8e4f21 100644
--- a/cuda_pathfinder/cuda/pathfinder/__init__.py
+++ b/cuda_pathfinder/cuda/pathfinder/__init__.py
@@ -19,6 +19,9 @@
     locate_nvidia_header_directory as locate_nvidia_header_directory,
 )
 from cuda.pathfinder._headers.supported_nvidia_headers import SUPPORTED_HEADERS_CTK as _SUPPORTED_HEADERS_CTK
+from cuda.pathfinder._static_libs.find_bitcode_lib import (
+    SUPPORTED_BITCODE_LIBS as _SUPPORTED_BITCODE_LIBS,
+)
 from cuda.pathfinder._static_libs.find_bitcode_lib import (
     BitcodeLibNotFoundError as BitcodeLibNotFoundError,
 )
@@ -47,6 +50,11 @@
 #: Example utilities: ``"nvdisasm"``, ``"cuobjdump"``, ``"nvcc"``.
 SUPPORTED_BINARY_UTILITIES = _SUPPORTED_BINARIES
 
+#: Tuple of supported bitcode library names that can be resolved
+#: via ``locate_bitcode_lib()`` and ``find_bitcode_lib()``.
+#: Example value: ``"device"``.
+SUPPORTED_BITCODE_LIBS = _SUPPORTED_BITCODE_LIBS
+
 # Backward compatibility: _find_nvidia_header_directory was added in release 1.2.2.
 # It will be removed in release 1.2.4.
 _find_nvidia_header_directory = find_nvidia_header_directory