Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ require (
github.com/fsnotify/fsnotify v1.9.0
github.com/onsi/ginkgo/v2 v2.28.1
github.com/onsi/gomega v1.39.1
golang.org/x/sys v0.40.0
google.golang.org/grpc v1.79.2
k8s.io/klog/v2 v2.140.0
k8s.io/kubelet v0.33.5
Expand All @@ -24,7 +25,6 @@ require (
golang.org/x/mod v0.32.0 // indirect
golang.org/x/net v0.49.0 // indirect
golang.org/x/sync v0.19.0 // indirect
golang.org/x/sys v0.40.0 // indirect
golang.org/x/text v0.33.0 // indirect
golang.org/x/tools v0.41.0 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217 // indirect
Expand Down
14 changes: 13 additions & 1 deletion pkg/device_plugin/device_plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,9 +112,21 @@ func createDevicePlugins() {
for k, gpuDevices := range deviceMap {
devs = nil
for _, gpuDev := range gpuDevices {
// Mark devices whose /dev/vfio/<group> is currently held by
// another process as Unhealthy. Including them in the pool
// preserves the kubelet's capacity accounting and the existing
// allocations of running pods that already hold them; the
// Unhealthy flag keeps the kubelet from handing the same PCI
// ID to a new pod, which would otherwise fail with
// "Could not open '/dev/vfio/<group>': Device or resource busy".
health := pluginapi.Healthy
if iommuGroup, ok := bdfToIommuMap[gpuDev.addr]; ok && isVfioGroupBusy(iommuGroup) {
health = pluginapi.Unhealthy
log.Printf("Marking %s Unhealthy: /dev/vfio/%s is held by another process (already in use)", gpuDev.addr, iommuGroup)
}
device := &pluginapi.Device{
ID: gpuDev.addr,
Health: pluginapi.Healthy,
Health: health,
Topology: &pluginapi.TopologyInfo{
Nodes: []*pluginapi.NUMANode{
{ID: gpuDev.numaNode},
Expand Down
71 changes: 71 additions & 0 deletions pkg/device_plugin/vfio_busy.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
/*
* Copyright (c) 2019-2026, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

package device_plugin

import (
"errors"
"path/filepath"

"golang.org/x/sys/unix"
)

// isVfioGroupBusy reports whether /dev/vfio/<group> is currently held by
// another process (typically a running qemu-kvm / virt-launcher with a GPU
// passed through). The VFIO kernel driver returns EBUSY on the second
// open(2) of a group whose `opened` refcount is already 1, so a single
// non-blocking open is enough to distinguish "free" from "in use".
//
// Returning true tells the device plugin to advertise this device to kubelet
// as Unhealthy. The device stays in the advertised pool, but a new pod is
// never assigned a PCI device that is already passed through to another
// tenant VM — preventing the
// "Could not open '/dev/vfio/<group>': Device or resource busy" qemu
// crashloop.
//
// This is a package-level variable (defaulting to isVfioGroupBusyFunc) rather
// than a direct function call so that it is overridable for tests.
var isVfioGroupBusy = isVfioGroupBusyFunc

// vfioGroupBasePath is the directory under which the per-group VFIO nodes are
// looked up; it defaults to /dev/vfio. Overridable for tests.
var vfioGroupBasePath = "/dev/vfio"

// isVfioGroupBusyFunc probes the node <vfioGroupBasePath>/<group> with one
// open(2) using O_RDWR|O_NONBLOCK|O_CLOEXEC.
//
// It returns true only when that open fails with EBUSY. A successful open
// (the descriptor is closed again immediately) and every other failure both
// yield false.
func isVfioGroupBusyFunc(group string) bool {
	groupNode := filepath.Join(vfioGroupBasePath, group)

	handle, openErr := unix.Open(groupNode, unix.O_RDWR|unix.O_NONBLOCK|unix.O_CLOEXEC, 0)
	if openErr == nil {
		// The probe itself now holds the node; release it straight away.
		// The close error is deliberately ignored: the descriptor was only
		// ever needed to learn whether the open would succeed.
		_ = unix.Close(handle)
		return false
	}

	// EBUSY is the only error treated as proof that the group is in use.
	// Anything else (ENOENT, EACCES, ...) cannot prove that, so it is
	// reported as not-busy rather than letting a missing /dev node or a
	// permission glitch flag the device.
	return errors.Is(openErr, unix.EBUSY)
}
145 changes: 145 additions & 0 deletions pkg/device_plugin/vfio_busy_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
/*
* Copyright (c) 2019-2026, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

package device_plugin

import (
"os"
"path/filepath"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"

pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
)

// Specs for the VFIO busy-group probe (isVfioGroupBusyFunc) and for how
// createDevicePlugins translates the probe result into the Health value of
// the devices it hands to the plugin.
var _ = Describe("vfio-busy", func() {

	Describe("isVfioGroupBusyFunc", func() {
		var (
			tmpDir       string
			origBasePath string
		)

		BeforeEach(func() {
			// Point the probe at a throw-away directory instead of /dev/vfio.
			var err error
			tmpDir, err = os.MkdirTemp("", "vfio-busy-test")
			Expect(err).ToNot(HaveOccurred())
			origBasePath = vfioGroupBasePath
			vfioGroupBasePath = tmpDir
		})

		AfterEach(func() {
			vfioGroupBasePath = origBasePath
			// Surface cleanup failures instead of silently leaking temp dirs.
			Expect(os.RemoveAll(tmpDir)).To(Succeed())
		})

		It("reports not busy when /dev/vfio/<group> is absent", func() {
			// A transient ENOENT (missing /dev node or wrong mount) is not
			// proof that the group is in use, so it must not flag the device.
			Expect(isVfioGroupBusyFunc("does-not-exist")).To(BeFalse())
		})

		It("reports not busy when the group node opens cleanly", func() {
			// A regular file stands in for a free vfio group node — it opens
			// without EBUSY, so the helper must report not busy.
			groupPath := filepath.Join(tmpDir, "42")
			Expect(os.WriteFile(groupPath, nil, 0644)).To(Succeed())
			Expect(isVfioGroupBusyFunc("42")).To(BeFalse())
		})
	})

	Describe("createDevicePlugins marks busy-group GPUs Unhealthy", func() {
		var (
			origIsBusy     func(string) bool
			origDeviceMap  map[string][]NvidiaGpuDevice
			origBdfToIommu map[string]string
			origStart      func(*GenericDevicePlugin) error
			// started carries every plugin handed to the stubbed
			// startDevicePlugin. A channel (rather than a shared slice) is
			// used because the stub runs on the createDevicePlugins goroutine
			// while the spec observes the result from its own goroutine; the
			// channel hand-off synchronises the two without a data race.
			started chan *GenericDevicePlugin
		)

		BeforeEach(func() {
			origDeviceMap = deviceMap
			origBdfToIommu = bdfToIommuMap

			// Two NVIDIA GPUs of the same device id sharing a single plugin:
			//   0000:23:00.0 → group 51 (busy, held by another tenant VM)
			//   0000:24:00.0 → group 73 (free)
			deviceMap = map[string][]NvidiaGpuDevice{
				"2684": {
					{addr: "0000:23:00.0", numaNode: 0},
					{addr: "0000:24:00.0", numaNode: 0},
				},
			}
			bdfToIommuMap = map[string]string{
				"0000:23:00.0": "51",
				"0000:24:00.0": "73",
			}

			origIsBusy = isVfioGroupBusy
			isVfioGroupBusy = func(group string) bool { return group == "51" }

			origStart = startDevicePlugin
			// Buffered so the stub never blocks createDevicePlugins even if
			// more plugins than expected are started; extras are detected by
			// the emptiness check in the spec.
			ch := make(chan *GenericDevicePlugin, 16)
			started = ch
			startDevicePlugin = func(dp *GenericDevicePlugin) error {
				ch <- dp
				return nil
			}
		})

		AfterEach(func() {
			deviceMap = origDeviceMap
			bdfToIommuMap = origBdfToIommu
			isVfioGroupBusy = origIsBusy
			startDevicePlugin = origStart
		})

		It("keeps busy GPUs in the pool but marks them Unhealthy", func() {
			// createDevicePlugins blocks on the package stop channel after
			// constructing the plugins. Run it in a goroutine, wait for the
			// plugin to arrive on `started`, then unblock it.
			go createDevicePlugins()

			var plugin *GenericDevicePlugin
			Eventually(started, "2s", "20ms").
				Should(Receive(&plugin), "a single plugin is created per device id (here 2684)")
			stop <- struct{}{}
			Expect(started).To(BeEmpty(), "a single plugin is created per device id (here 2684)")

			devsByID := map[string]string{}
			for _, d := range plugin.devs {
				devsByID[d.ID] = d.Health
			}

			Expect(devsByID).To(HaveKey("0000:23:00.0"),
				"a busy GPU must stay in the advertised pool so kubelet's capacity accounting is correct and the existing-pod allocation is preserved")
			Expect(devsByID["0000:23:00.0"]).To(Equal(pluginapi.Unhealthy),
				"a busy GPU must be Unhealthy so kubelet does not hand the same PCI ID to a new pod — which would crashloop in qemu with /dev/vfio/<group> EBUSY")

			Expect(devsByID).To(HaveKeyWithValue("0000:24:00.0", pluginapi.Healthy),
				"a free GPU must remain Healthy and available for allocation")
		})
	})
})