Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
dc3222d
nvvm ir integration
abhilash1910 Aug 26, 2025
028a294
add test
abhilash1910 Aug 27, 2025
a418f4b
remove nvvm error handling from utils
abhilash1910 Sep 1, 2025
92af3dd
use version dependent nvvm inclusion
abhilash1910 Sep 1, 2025
bdd1671
fix nvvm compilation flow and test
abhilash1910 Sep 1, 2025
f4151bb
Merge branch 'main' into nvvm
abhilash1910 Sep 1, 2025
dc9a4e3
refactor
abhilash1910 Sep 3, 2025
22d18b9
fix unwanted rebase
abhilash1910 Sep 3, 2025
0f7fda4
fix core linter errors
abhilash1910 Sep 3, 2025
9ed8051
refactor tests
abhilash1910 Sep 3, 2025
bccda47
refactor
abhilash1910 Sep 3, 2025
64436c1
refactor
abhilash1910 Sep 3, 2025
58317d0
ruff format
abhilash1910 Sep 4, 2025
caf4f22
ruff format
abhilash1910 Sep 4, 2025
88237bc
revert changes to cuda_utils
abhilash1910 Sep 4, 2025
5e2e137
new line
abhilash1910 Sep 4, 2025
0301a4d
fix CI rm list import
abhilash1910 Sep 5, 2025
28e2d4b
use noqa
abhilash1910 Sep 7, 2025
af1008f
format
abhilash1910 Sep 7, 2025
a85a44f
verify and skip 110
abhilash1910 Sep 8, 2025
0be06aa
add flags and lto
abhilash1910 Sep 8, 2025
1c63a11
rename gpu-arch to arch
abhilash1910 Sep 15, 2025
cab6db0
change libnvvm version check
abhilash1910 Sep 15, 2025
3abcd38
format
abhilash1910 Sep 15, 2025
caf634c
compute 90
abhilash1910 Sep 15, 2025
64d19ab
Apply suggestions from code review
leofang Sep 15, 2025
d5d216c
Merge branch 'main' into nvvm
leofang Sep 15, 2025
c5993dc
update test
abhilash1910 Sep 16, 2025
5d5b1d3
use exception manager
abhilash1910 Sep 16, 2025
f6b5528
format
abhilash1910 Sep 16, 2025
c7fad0a
format ruff
abhilash1910 Sep 16, 2025
2e6e02b
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Sep 16, 2025
680d790
add release notes
abhilash1910 Sep 16, 2025
c55fa59
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Sep 16, 2025
2cbee7f
rectify quotes
abhilash1910 Sep 16, 2025
63e8d57
refix format
abhilash1910 Sep 16, 2025
94c2e56
refresh
abhilash1910 Sep 17, 2025
34bf2cc
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Sep 17, 2025
6b130bb
use major minor
abhilash1910 Sep 17, 2025
d96c848
fix test
leofang Sep 17, 2025
fcd7c0c
Merge branch 'main' into nvvm
leofang Sep 17, 2025
8331ecf
fix IR - again
leofang Sep 17, 2025
2fa944e
fix nvvm option handling
leofang Sep 17, 2025
4d32276
remove redundant IR & fix linter
leofang Sep 17, 2025
e5b5ea4
avoid extra copy + ensure compiled objcode loadable
leofang Sep 17, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions cuda_core/cuda/core/experimental/_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from cuda.core.experimental._stream import Stream
from cuda.core.experimental._utils.clear_error_support import (
assert_type,
assert_type_str_or_bytes,
assert_type_str_or_bytes_like,
raise_code_path_meant_to_be_unreachable,
)
from cuda.core.experimental._utils.cuda_utils import driver, get_binding_version, handle_return, precondition
Expand Down Expand Up @@ -615,14 +615,14 @@ def _lazy_load_module(self, *args, **kwargs):
if self._handle is not None:
return
module = self._module
assert_type_str_or_bytes(module)
assert_type_str_or_bytes_like(module)
if isinstance(module, str):
if self._backend_version == "new":
self._handle = handle_return(self._loader["file"](module.encode(), [], [], 0, [], [], 0))
else: # "old" backend
self._handle = handle_return(self._loader["file"](module.encode()))
return
if isinstance(module, bytes):
if isinstance(module, (bytes, bytearray)):
if self._backend_version == "new":
self._handle = handle_return(self._loader["data"](module, [], [], 0, [], [], 0))
else: # "old" backend
Expand Down
166 changes: 156 additions & 10 deletions cuda_core/cuda/core/experimental/_program.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from __future__ import annotations

import weakref
from contextlib import contextmanager
from dataclasses import dataclass
from typing import TYPE_CHECKING, Union
from warnings import warn
Expand All @@ -20,13 +21,87 @@
_handle_boolean_option,
check_or_create_options,
driver,
get_binding_version,
handle_return,
is_nested_sequence,
is_sequence,
nvrtc,
)


@contextmanager
def _nvvm_exception_manager(self):
"""
Taken from _linker.py
"""
try:
yield
except Exception as e:
error_log = ""
if hasattr(self, "_mnff"):
try:
nvvm = _get_nvvm_module()
logsize = nvvm.get_program_log_size(self._mnff.handle)
if logsize > 1:
log = bytearray(logsize)
nvvm.get_program_log(self._mnff.handle, log)
error_log = log.decode("utf-8", errors="backslashreplace")
except Exception:
error_log = ""
# Starting Python 3.11 we could also use Exception.add_note() for the same purpose, but
# unfortunately we are still supporting Python 3.9/3.10...
e.args = (e.args[0] + (f"\nNVVM program log: {error_log}" if error_log else ""), *e.args[1:])
raise e


# Cache for the imported cuda.bindings.nvvm module; populated by
# _get_nvvm_module() on first successful import.
_nvvm_module = None
# Set to True after the first import attempt (successful or not) so a failed
# import is not retried on every call.
_nvvm_import_attempted = False


def _get_nvvm_module():
    """
    Handles the import of NVVM module with version and availability checks.
    NVVM bindings were added in cuda-bindings 12.9.0, so we need to handle cases where:
    1. cuda.bindings is not new enough (< 12.9.0)
    2. libnvvm is not found in the Python environment

    Returns:
        The nvvm module if available and working

    Raises:
        RuntimeError: If NVVM is not available due to version or library issues
    """
    global _nvvm_module, _nvvm_import_attempted

    if _nvvm_import_attempted:
        # Do not retry a failed import; fail fast with a stable message.
        if _nvvm_module is None:
            raise RuntimeError("NVVM module is not available (previous import attempt failed)")
        return _nvvm_module

    _nvvm_import_attempted = True

    try:
        version = get_binding_version()
        if version < (12, 9):
            raise RuntimeError(
                f"NVVM bindings require cuda-bindings >= 12.9.0, but found {version[0]}.{version[1]}.x. "
                "Please update cuda-bindings to use NVVM features."
            )

        from cuda.bindings import nvvm
        from cuda.bindings._internal.nvvm import _inspect_function_pointer

        # A null function pointer means the bindings are importable but
        # libnvvm itself could not be located/loaded.
        if _inspect_function_pointer("__nvvmCreateProgram") == 0:
            raise RuntimeError("NVVM library (libnvvm) is not available in this Python environment.")

        _nvvm_module = nvvm
        return _nvvm_module

    except ImportError as e:
        # Normalize import failures to the documented RuntimeError contract
        # (the original code let ImportError escape despite the docstring).
        raise RuntimeError("NVVM bindings could not be imported from cuda.bindings.") from e


def _process_define_macro_inner(formatted_options, macro):
if isinstance(macro, str):
formatted_options.append(f"--define-macro={macro}")
Expand Down Expand Up @@ -229,11 +304,10 @@ def __post_init__(self):

self._formatted_options = []
if self.arch is not None:
self._formatted_options.append(f"--gpu-architecture={self.arch}")
self._formatted_options.append(f"-arch={self.arch}")
else:
self._formatted_options.append(
"--gpu-architecture=sm_" + "".join(f"{i}" for i in Device().compute_capability)
)
self.arch = f"sm_{Device().arch}"
self._formatted_options.append(f"-arch={self.arch}")
if self.relocatable_device_code is not None:
self._formatted_options.append(
f"--relocatable-device-code={_handle_boolean_option(self.relocatable_device_code)}"
Expand Down Expand Up @@ -370,28 +444,33 @@ class Program:
code : Any
String of the CUDA Runtime Compilation program.
code_type : Any
String of the code type. Currently ``"ptx"`` and ``"c++"`` are supported.
String of the code type. Currently ``"ptx"``, ``"c++"``, and ``"nvvm"`` are supported.
options : ProgramOptions, optional
A ProgramOptions object to customize the compilation process.
See :obj:`ProgramOptions` for more information.
"""

class _MembersNeededForFinalize:
__slots__ = "handle"
__slots__ = "handle", "backend"

def __init__(self, program_obj, handle):
def __init__(self, program_obj, handle, backend):
self.handle = handle
self.backend = backend
weakref.finalize(program_obj, self.close)

def close(self):
if self.handle is not None:
handle_return(nvrtc.nvrtcDestroyProgram(self.handle))
if self.backend == "NVRTC":
handle_return(nvrtc.nvrtcDestroyProgram(self.handle))
elif self.backend == "NVVM":
nvvm = _get_nvvm_module()
nvvm.destroy_program(self.handle)
self.handle = None

__slots__ = ("__weakref__", "_mnff", "_backend", "_linker", "_options")

def __init__(self, code, code_type, options: ProgramOptions = None):
self._mnff = Program._MembersNeededForFinalize(self, None)
self._mnff = Program._MembersNeededForFinalize(self, None, None)

self._options = options = check_or_create_options(ProgramOptions, options, "Program options")
code_type = code_type.lower()
Expand All @@ -402,6 +481,7 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
# TODO: allow tuples once NVIDIA/cuda-python#72 is resolved

self._mnff.handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), options._name, 0, [], []))
self._mnff.backend = "NVRTC"
self._backend = "NVRTC"
self._linker = None

Expand All @@ -411,8 +491,22 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
ObjectCode._init(code.encode(), code_type), options=self._translate_program_options(options)
)
self._backend = self._linker.backend

elif code_type == "nvvm":
if isinstance(code, str):
code = code.encode("utf-8")
elif not isinstance(code, (bytes, bytearray)):
raise TypeError("NVVM IR code must be provided as str, bytes, or bytearray")

nvvm = _get_nvvm_module()
self._mnff.handle = nvvm.create_program()
self._mnff.backend = "NVVM"
nvvm.add_module_to_program(self._mnff.handle, code, len(code), options._name.decode())
self._backend = "NVVM"
self._linker = None

else:
supported_code_types = ("c++", "ptx")
supported_code_types = ("c++", "ptx", "nvvm")
assert code_type not in supported_code_types, f"{code_type=}"
raise RuntimeError(f"Unsupported {code_type=} ({supported_code_types=})")

Expand All @@ -433,6 +527,33 @@ def _translate_program_options(self, options: ProgramOptions) -> LinkerOptions:
ptxas_options=options.ptxas_options,
)

def _translate_program_options_to_nvvm(self, options: ProgramOptions) -> list[str]:
"""Translate ProgramOptions to NVVM-specific compilation options."""
nvvm_options = []

assert options.arch is not None
arch = options.arch
if arch.startswith("sm_"):
arch = f"compute_{arch[3:]}"
nvvm_options.append(f"-arch={arch}")
if options.debug:
nvvm_options.append("-g")
if options.device_code_optimize is False:
nvvm_options.append("-opt=0")
elif options.device_code_optimize is True:
nvvm_options.append("-opt=3")
# NVVM is not consistent with NVRTC, it uses 0/1 instead...
if options.ftz is not None:
nvvm_options.append(f"-ftz={'1' if options.ftz else '0'}")
if options.prec_sqrt is not None:
nvvm_options.append(f"-prec-sqrt={'1' if options.prec_sqrt else '0'}")
if options.prec_div is not None:
nvvm_options.append(f"-prec-div={'1' if options.prec_div else '0'}")
if options.fma is not None:
nvvm_options.append(f"-fma={'1' if options.fma else '0'}")

return nvvm_options

def close(self):
"""Destroy this program."""
if self._linker:
Expand Down Expand Up @@ -513,6 +634,31 @@ def compile(self, target_type, name_expressions=(), logs=None):

return ObjectCode._init(data, target_type, symbol_mapping=symbol_mapping, name=self._options.name)

elif self._backend == "NVVM":
if target_type not in ("ptx", "ltoir"):
raise ValueError(f'NVVM backend only supports target_type="ptx", "ltoir", got "{target_type}"')

nvvm_options = self._translate_program_options_to_nvvm(self._options)
if target_type == "ltoir" and "-gen-lto" not in nvvm_options:
nvvm_options.append("-gen-lto")
nvvm = _get_nvvm_module()
with _nvvm_exception_manager(self):
nvvm.verify_program(self._mnff.handle, len(nvvm_options), nvvm_options)
nvvm.compile_program(self._mnff.handle, len(nvvm_options), nvvm_options)

size = nvvm.get_compiled_result_size(self._mnff.handle)
data = bytearray(size)
nvvm.get_compiled_result(self._mnff.handle, data)

if logs is not None:
logsize = nvvm.get_program_log_size(self._mnff.handle)
if logsize > 1:
log = bytearray(logsize)
nvvm.get_program_log(self._mnff.handle, log)
logs.write(log.decode("utf-8", errors="backslashreplace"))

return ObjectCode._init(data, target_type, name=self._options.name)

supported_backends = ("nvJitLink", "driver")
if self._backend not in supported_backends:
raise ValueError(f'Unsupported backend="{self._backend}" ({supported_backends=})')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@ def assert_type(obj, expected_type):
raise TypeError(f"Expected type {expected_type.__name__}, but got {type(obj).__name__}")


def assert_type_str_or_bytes_like(obj):
    """Ensure obj is of type str, bytes, or bytearray, else raise TypeError with a clear message."""
    if not isinstance(obj, (str, bytes, bytearray)):
        raise TypeError(f"Expected type str or bytes or bytearray, but got {type(obj).__name__}")


def raise_code_path_meant_to_be_unreachable():
Expand Down
1 change: 1 addition & 0 deletions cuda_core/docs/source/release/0.X.Y-notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ New features
- Added :attr:`Device.arch` property that returns the compute capability as a string (e.g., '75' for CC 7.5), providing a convenient alternative to manually concatenating the compute capability tuple.
- CUDA 13.x testing support through new ``test-cu13`` dependency group.
- Stream-ordered memory allocation can now be shared on Linux via :class:`DeviceMemoryResource`.
- Added NVVM IR support to :class:`Program`. NVVM IR is now understood with ``code_type="nvvm"``.


New examples
Expand Down
Loading
Loading