diff --git a/go.mod b/go.mod index a09b4a8b..9d34a899 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,7 @@ require ( github.com/fsnotify/fsnotify v1.9.0 github.com/onsi/ginkgo/v2 v2.28.1 github.com/onsi/gomega v1.39.1 + golang.org/x/sys v0.40.0 google.golang.org/grpc v1.79.2 k8s.io/klog/v2 v2.140.0 k8s.io/kubelet v0.33.5 @@ -24,7 +25,6 @@ require ( golang.org/x/mod v0.32.0 // indirect golang.org/x/net v0.49.0 // indirect golang.org/x/sync v0.19.0 // indirect - golang.org/x/sys v0.40.0 // indirect golang.org/x/text v0.33.0 // indirect golang.org/x/tools v0.41.0 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217 // indirect diff --git a/pkg/device_plugin/device_plugin.go b/pkg/device_plugin/device_plugin.go index 1a89a5c0..c9163f1b 100644 --- a/pkg/device_plugin/device_plugin.go +++ b/pkg/device_plugin/device_plugin.go @@ -112,9 +112,21 @@ func createDevicePlugins() { for k, gpuDevices := range deviceMap { devs = nil for _, gpuDev := range gpuDevices { + // Mark devices whose /dev/vfio/<group> is currently held by + // another process as Unhealthy. Including them in the pool + // preserves the kubelet's capacity accounting and the existing + // allocations of running pods that already hold them; the + // Unhealthy flag keeps the kubelet from handing the same PCI + // ID to a new pod, which would otherwise fail with + // "Could not open '/dev/vfio/<group>': Device or resource busy".
+ health := pluginapi.Healthy + if iommuGroup, ok := bdfToIommuMap[gpuDev.addr]; ok && isVfioGroupBusy(iommuGroup) { + health = pluginapi.Unhealthy + log.Printf("Marking %s Unhealthy: /dev/vfio/%s is held by another process (already in use)", gpuDev.addr, iommuGroup) + } device := &pluginapi.Device{ ID: gpuDev.addr, - Health: pluginapi.Healthy, + Health: health, Topology: &pluginapi.TopologyInfo{ Nodes: []*pluginapi.NUMANode{ {ID: gpuDev.numaNode}, diff --git a/pkg/device_plugin/vfio_busy.go b/pkg/device_plugin/vfio_busy.go new file mode 100644 index 00000000..29f15e6c --- /dev/null +++ b/pkg/device_plugin/vfio_busy.go @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2019-2026, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package device_plugin + +import ( + "errors" + "path/filepath" + + "golang.org/x/sys/unix" +) + +// isVfioGroupBusy reports whether /dev/vfio/<group> is currently held by +// another process (typically a running qemu-kvm / virt-launcher with a GPU +// passed through). The VFIO kernel driver returns EBUSY on the second +// open(2) of a group whose `opened` refcount is already 1, so a single +// non-blocking open is enough to distinguish "free" from "in use". +// +// Returning true tells the device plugin to advertise this device to the +// kubelet as Unhealthy (it stays in the pool), so a new pod is never assigned +// a PCI device that is already passed through to another VM — preventing +// the "Could not open '/dev/vfio/<group>': Device or resource busy" qemu +// crashloop. +// +// Overridable for tests. +var isVfioGroupBusy = isVfioGroupBusyFunc + +// vfioGroupBasePath is /dev/vfio. Overridable for tests. +var vfioGroupBasePath = "/dev/vfio" + +func isVfioGroupBusyFunc(group string) bool { + path := filepath.Join(vfioGroupBasePath, group) + fd, err := unix.Open(path, unix.O_RDWR|unix.O_NONBLOCK|unix.O_CLOEXEC, 0) + if err != nil { + if errors.Is(err, unix.EBUSY) { + return true + } + // Any other error (ENOENT, EACCES, ...) means we cannot prove the + // group is in use. Be conservative and report it as not-busy so a + // missing /dev node or permission glitch does not silently mark a + // usable GPU Unhealthy.
+ return false + } + _ = unix.Close(fd) + return false +} diff --git a/pkg/device_plugin/vfio_busy_test.go b/pkg/device_plugin/vfio_busy_test.go new file mode 100644 index 00000000..42239a26 --- /dev/null +++ b/pkg/device_plugin/vfio_busy_test.go @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2019-2026, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package device_plugin + +import ( + "os" + "path/filepath" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + + pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" +) + +var _ = Describe("vfio-busy", func() { + + Describe("isVfioGroupBusyFunc", func() { + var ( + tmpDir string + origBasePath string + err error + ) + + BeforeEach(func() { + tmpDir, err = os.MkdirTemp("", "vfio-busy-test") + Expect(err).ToNot(HaveOccurred()) + origBasePath = vfioGroupBasePath + vfioGroupBasePath = tmpDir + }) + + AfterEach(func() { + vfioGroupBasePath = origBasePath + os.RemoveAll(tmpDir) + }) + + It("reports not busy when /dev/vfio/ is absent", func() { + // A transient ENOENT (missing /dev node or wrong mount) must + // not silently shrink the advertised pool. + Expect(isVfioGroupBusyFunc("does-not-exist")).To(BeFalse()) + }) + + It("reports not busy when the group node opens cleanly", func() { + // A regular file stands in for a free vfio group node — it opens + // without EBUSY, so the helper must report not busy. + groupPath := filepath.Join(tmpDir, "42") + Expect(os.WriteFile(groupPath, nil, 0644)).To(Succeed()) + Expect(isVfioGroupBusyFunc("42")).To(BeFalse()) + }) + }) + + Describe("createDevicePlugins marks busy-group GPUs Unhealthy", func() { + var ( + origIsBusy func(string) bool + origDeviceMap map[string][]NvidiaGpuDevice + origBdfToIommu map[string]string + origStart func(*GenericDevicePlugin) error + started []*GenericDevicePlugin + ) + + BeforeEach(func() { + origDeviceMap = deviceMap + origBdfToIommu = bdfToIommuMap + + // Two NVIDIA GPUs of the same device id sharing a single plugin: + // 0000:23:00.0 → group 51 (busy, held by another tenant VM) + // 0000:24:00.0 → group 73 (free) + deviceMap = map[string][]NvidiaGpuDevice{ + "2684": { + {addr: "0000:23:00.0", numaNode: 0}, + {addr: "0000:24:00.0", numaNode: 0}, + }, + } + bdfToIommuMap = map[string]string{ + "0000:23:00.0": "51", + "0000:24:00.0": "73", + } + + origIsBusy = isVfioGroupBusy + isVfioGroupBusy = func(group string) bool { return group == "51" } + + origStart = startDevicePlugin 
+ started = nil + startDevicePlugin = func(dp *GenericDevicePlugin) error { + started = append(started, dp) + return nil + } + }) + + AfterEach(func() { + deviceMap = origDeviceMap + bdfToIommuMap = origBdfToIommu + isVfioGroupBusy = origIsBusy + startDevicePlugin = origStart + }) + + It("keeps busy GPUs in the pool but marks them Unhealthy", func() { + // createDevicePlugins blocks on the package stop channel after + // constructing the plugins. Run it in a goroutine, give it a + // moment to populate `started`, then unblock it. + go createDevicePlugins() + Eventually(func() int { return len(started) }, "2s", "20ms"). + Should(Equal(1), "a single plugin is created per device id (here 2684)") + stop <- struct{}{} + + devsByID := map[string]string{} + for _, d := range started[0].devs { + devsByID[d.ID] = d.Health + } + + Expect(devsByID).To(HaveKey("0000:23:00.0"), + "a busy GPU must stay in the advertised pool so kubelet's capacity accounting is correct and the existing-pod allocation is preserved") + Expect(devsByID["0000:23:00.0"]).To(Equal(pluginapi.Unhealthy), + "a busy GPU must be Unhealthy so kubelet does not hand the same PCI ID to a new pod — which would crashloop in qemu with /dev/vfio/ EBUSY") + + Expect(devsByID).To(HaveKeyWithValue("0000:24:00.0", pluginapi.Healthy), + "a free GPU must remain Healthy and available for allocation") + }) + }) +})