From f658a42eab7c0536f7f8e65600d72ea56e6c24ac Mon Sep 17 00:00:00 2001
From: Stephen Jia <ssjia@meta.com>
Date: Thu, 2 Jan 2025 13:05:28 -0800
Subject: [PATCH 1/2] [ET-VK][ez] Fix undefined behaviour in ambiguous
 `ParamsBuffer` constructor

## Context

I discovered this bug when trying to execute the `vulkan_compute_api_test` binary on Windows. Almost all the tests were failing, with compute shaders producing incorrect results. After bisecting the change, it turns out the culprit is https://github.com/pytorch/executorch/pull/7015. The diff introduced an alternative templated constructor for `ParamsBuffer` which would initialize an empty UBO with a specified size instead of wrapping a pre-existing object.

The issue is that these constructors are ambiguous because they both are template constructors and both only accept one argument. Therefore, the original constructor would be called when certain callsites intended to call the new constructor. This results in a UBO being created with an incorrect size, and resulted in the tensor's metadata being passed incorrectly into a compute shader.

To fix, I added a dummy argument into the new constructor for disambiguation purposes. I also changed it so that it's not templated, since there's no reason for it to be templated.

Differential Revision: [D67770791](https://our.internmc.facebook.com/intern/diff/D67770791/)

ghstack-source-id: 260031108
Pull Request resolved: https://github.com/pytorch/executorch/pull/7478
---
 backends/vulkan/runtime/api/containers/ParamsBuffer.h | 5 +++--
 backends/vulkan/runtime/api/containers/Tensor.cpp     | 8 ++++----
 2 files changed, 7 insertions(+), 6 deletions(-)
diff --git a/backends/vulkan/runtime/api/containers/ParamsBuffer.h b/backends/vulkan/runtime/api/containers/ParamsBuffer.h
index fe157c5e014..ecc07892cf7 100644
--- a/backends/vulkan/runtime/api/containers/ParamsBuffer.h
+++ b/backends/vulkan/runtime/api/containers/ParamsBuffer.h
@@ -31,8 +31,9 @@ class ParamsBuffer final {
         vulkan_buffer_(
             context_p_->adapter_ptr()->vma().create_params_buffer(block)) {}
 
-  template <typename Block>
-  ParamsBuffer(Context* context_p, const VkDeviceSize nbytes)
+  // The last bool argument, though unused, is required to disambiguate this
+  // constructor from the one above.
+  ParamsBuffer(Context* context_p, const VkDeviceSize nbytes, const bool unused)
       : context_p_(context_p),
         vulkan_buffer_(
             context_p_->adapter_ptr()->vma().create_uniform_buffer(nbytes)) {}
diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp
index 21b0ee4b176..92e310d36de 100644
--- a/backends/vulkan/runtime/api/containers/Tensor.cpp
+++ b/backends/vulkan/runtime/api/containers/Tensor.cpp
@@ -659,7 +659,7 @@ utils::GPUMemoryLayout vTensor::estimate_memory_layout() const {
 
 const vkapi::BufferBindInfo vTensor::sizes_ubo() {
   if (!uniforms_.buffer()) {
-    uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize);
+    uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize, true);
   }
   if (sizes_uniform_offset_ == kUniformOffsetUnset) {
     VK_CHECK_COND(
@@ -674,7 +674,7 @@ const vkapi::BufferBindInfo vTensor::sizes_ubo() {
 
 const vkapi::BufferBindInfo vTensor::strides_ubo() {
   if (!uniforms_.buffer()) {
-    uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize);
+    uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize, true);
   }
   if (unsqueezed_strides_offset_ == kUniformOffsetUnset) {
     VK_CHECK_COND(
@@ -691,7 +691,7 @@ const vkapi::BufferBindInfo vTensor::strides_ubo() {
 
 const vkapi::BufferBindInfo vTensor::logical_limits_ubo() {
   if (!uniforms_.buffer()) {
-    uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize);
+    uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize, true);
   }
   if (logical_limits_uniform_offset_ == kUniformOffsetUnset) {
     VK_CHECK_COND(
@@ -707,7 +707,7 @@ const vkapi::BufferBindInfo vTensor::logical_limits_ubo() {
 
 const vkapi::BufferBindInfo vTensor::numel_ubo() {
   if (!uniforms_.buffer()) {
-    uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize);
+    uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize, true);
   }
   if (numel_uniform_offset_ == kUniformOffsetUnset) {
     VK_CHECK_COND(

From bd59382936997c9fd0ab594deaed9d95513c6789 Mon Sep 17 00:00:00 2001
From: Stephen Jia <ssjia@meta.com>
Date: Thu, 2 Jan 2025 13:05:30 -0800
Subject: [PATCH 2/2] [ET-VK] Create Pipeline layouts with push constant ranges
 when required

## Context

https://github.com/pytorch/executorch/pull/7223 added the ability to use push constants in shaders. However, one thing the diff missed was not specifying that the compute pipeline layout needed to include a push constant upon creation. The Vulkan validation layers warns against this, and on certain GPUs such as the integrated Intel GPU on my windows laptop compute shaders will produce incorrect output.

This diff makes the change such that the compute pipeline layout will be created with a push constant block if necessary.

## Solution

Change the key of the pipeline layout cache to accept an additional push constant size field. The push constant size will be used to create the pipeline layout with a push constant block of the specified size.

Differential Revision: [D67770793](https://our.internmc.facebook.com/intern/diff/D67770793/)

ghstack-source-id: 260031109
Pull Request resolved: https://github.com/pytorch/executorch/pull/7479
---
 backends/vulkan/runtime/api/Context.cpp       |  9 +++---
 backends/vulkan/runtime/api/Context.h         |  9 ++++--
 .../vulkan/runtime/graph/ops/DispatchNode.cpp | 23 +++++++-------
 .../vulkan/runtime/graph/ops/PrepackNode.cpp  |  4 +--
 backends/vulkan/runtime/vk_api/Pipeline.cpp   | 31 +++++++++++++++----
 backends/vulkan/runtime/vk_api/Pipeline.h     | 16 ++++++----
 6 files changed, 60 insertions(+), 32 deletions(-)

diff --git a/backends/vulkan/runtime/api/Context.cpp b/backends/vulkan/runtime/api/Context.cpp
index 5426ea4e60b..9517941f364 100644
--- a/backends/vulkan/runtime/api/Context.cpp
+++ b/backends/vulkan/runtime/api/Context.cpp
@@ -90,12 +90,13 @@ void Context::report_shader_dispatch_end() {
 vkapi::DescriptorSet Context::get_descriptor_set(
     const vkapi::ShaderInfo& shader_descriptor,
     const utils::uvec3& local_workgroup_size,
-    const vkapi::SpecVarList& additional_constants) {
+    const vkapi::SpecVarList& additional_constants,
+    const uint32_t push_constants_size) {
   VkDescriptorSetLayout shader_layout =
       shader_layout_cache().retrieve(shader_descriptor.kernel_layout);
 
   VkPipelineLayout pipeline_layout =
-      pipeline_layout_cache().retrieve(shader_layout);
+      pipeline_layout_cache().retrieve(shader_layout, push_constants_size);
 
   vkapi::SpecVarList spec_constants = {
       SV(local_workgroup_size[0u]),
@@ -105,7 +106,7 @@ vkapi::DescriptorSet Context::get_descriptor_set(
   spec_constants.append(additional_constants);
 
   VkPipeline pipeline = pipeline_cache().retrieve(
-      {pipeline_layout_cache().retrieve(shader_layout),
+      {pipeline_layout_cache().retrieve(shader_layout, push_constants_size),
        shader_cache().retrieve(shader_descriptor),
        spec_constants});
 
@@ -151,7 +152,7 @@ void Context::register_shader_dispatch(
     const VkDescriptorSetLayout shader_layout =
         shader_layout_cache().retrieve(shader_descriptor.kernel_layout);
     const VkPipelineLayout pipeline_layout =
-        pipeline_layout_cache().retrieve(shader_layout);
+        pipeline_layout_cache().retrieve(shader_layout, push_constants_size);
     cmd_.set_push_constants(
         pipeline_layout, push_constants_data, push_constants_size);
   }
diff --git a/backends/vulkan/runtime/api/Context.h b/backends/vulkan/runtime/api/Context.h
index 65f3adb511d..300fd3995dd 100644
--- a/backends/vulkan/runtime/api/Context.h
+++ b/backends/vulkan/runtime/api/Context.h
@@ -188,12 +188,13 @@ class Context final {
   vkapi::DescriptorSet get_descriptor_set(
       const vkapi::ShaderInfo&,
       const utils::uvec3&,
-      const vkapi::SpecVarList&);
+      const vkapi::SpecVarList&,
+      const uint32_t push_constants_size);
 
   inline vkapi::DescriptorSet get_descriptor_set(
       const vkapi::ShaderInfo& shader_descriptor,
       const utils::uvec3& local_work_group_size) {
-    return get_descriptor_set(shader_descriptor, local_work_group_size, {});
+    return get_descriptor_set(shader_descriptor, local_work_group_size, {}, 0u);
   }
 
   void register_shader_dispatch(
@@ -333,8 +334,10 @@ inline bool Context::submit_compute_job(
       dispatch_id);
 
   // Factor out template parameter independent code to minimize code bloat.
+  // Note that push constants are not exposed yet via this API, therefore the
+  // push constants size is assumed to be 0.
   vkapi::DescriptorSet descriptor_set = get_descriptor_set(
-      shader, local_work_group_size, specialization_constants);
+      shader, local_work_group_size, specialization_constants, 0u);
 
   detail::bind(
       descriptor_set,
diff --git a/backends/vulkan/runtime/graph/ops/DispatchNode.cpp b/backends/vulkan/runtime/graph/ops/DispatchNode.cpp
index 87b4b5b5480..a163a0d7aea 100644
--- a/backends/vulkan/runtime/graph/ops/DispatchNode.cpp
+++ b/backends/vulkan/runtime/graph/ops/DispatchNode.cpp
@@ -60,14 +60,24 @@ void DispatchNode::encode(ComputeGraph* graph) {
 
   std::unique_lock<std::mutex> cmd_lock = context->dispatch_lock();
 
+  std::array<uint8_t, kMaxPushConstantSize> push_constants_data;
+  uint32_t push_constants_offset = 0;
+
+  for (const auto& push_constant : push_constants_) {
+    push_constants_offset += push_constant.write(
+        push_constants_data.data(),
+        push_constants_offset,
+        kMaxPushConstantSize);
+  }
+
   context->report_shader_dispatch_start(
       shader_.kernel_name,
       global_workgroup_size_,
       local_workgroup_size_,
       node_id_);
 
-  vkapi::DescriptorSet descriptor_set =
-      context->get_descriptor_set(shader_, local_workgroup_size_, spec_vars_);
+  vkapi::DescriptorSet descriptor_set = context->get_descriptor_set(
+      shader_, local_workgroup_size_, spec_vars_, push_constants_offset);
 
   uint32_t idx = 0;
   idx = bind_values_to_descriptor_set(
@@ -75,15 +85,6 @@ void DispatchNode::encode(ComputeGraph* graph) {
 
   bind_params_to_descriptor_set(params_, descriptor_set, idx);
 
-  std::array<uint8_t, kMaxPushConstantSize> push_constants_data;
-  uint32_t push_constants_offset = 0;
-
-  for (const auto& push_constant : push_constants_) {
-    push_constants_offset += push_constant.write(
-        push_constants_data.data(),
-        push_constants_offset,
-        kMaxPushConstantSize);
-  }
   context->register_shader_dispatch(
       descriptor_set,
       pipeline_barrier,
diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp
index 89719fb0dd3..e27723468ab 100644
--- a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp
+++ b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp
@@ -75,8 +75,8 @@ void PrepackNode::encode(ComputeGraph* graph) {
 
   {
     vkapi::PipelineBarrier pipeline_barrier{};
-    vkapi::DescriptorSet descriptor_set =
-        context->get_descriptor_set(shader_, local_workgroup_size_, spec_vars_);
+    vkapi::DescriptorSet descriptor_set = context->get_descriptor_set(
+        shader_, local_workgroup_size_, spec_vars_, 0u);
 
     uint32_t idx = 0;
     bind_tensor_to_descriptor_set(
diff --git a/backends/vulkan/runtime/vk_api/Pipeline.cpp b/backends/vulkan/runtime/vk_api/Pipeline.cpp
index 49bbf083359..3856d406c24 100644
--- a/backends/vulkan/runtime/vk_api/Pipeline.cpp
+++ b/backends/vulkan/runtime/vk_api/Pipeline.cpp
@@ -205,17 +205,29 @@ bool operator==(const SpecVarList& lhs, const SpecVarList& rhs) {
 
 PipelineLayout::PipelineLayout(
     VkDevice device,
-    VkDescriptorSetLayout descriptor_layout)
+    VkDescriptorSetLayout descriptor_layout,
+    const uint32_t push_constants_size)
     : device_(device), handle_{VK_NULL_HANDLE} {
-  // TODO: Enable push constants
+  VkPushConstantRange pc_range{
+      VK_SHADER_STAGE_COMPUTE_BIT, // stageFlags
+      0u, // offset
+      push_constants_size, // size
+  };
+  uint32_t num_push_constants = 0u;
+  VkPushConstantRange* pc_ranges_ptr = nullptr;
+  if (push_constants_size > 0u) {
+    num_push_constants = 1u;
+    pc_ranges_ptr = &pc_range;
+  }
+
   const VkPipelineLayoutCreateInfo pipeline_layout_create_info{
       VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, // sType
       nullptr, // pNext
       0u, // flags
       1u, // setLayoutCount
       &descriptor_layout, // pSetLayouts
-      0u, // pushConstantRangeCount
-      nullptr, // pPushConstantRanges
+      num_push_constants, // pushConstantRangeCount
+      pc_ranges_ptr, // pPushConstantRanges
   };
 
   VK_CHECK(vkCreatePipelineLayout(
@@ -344,12 +356,19 @@ PipelineLayoutCache::~PipelineLayoutCache() {
 }
 
 VkPipelineLayout PipelineLayoutCache::retrieve(
-    const PipelineLayoutCache::Key& key) {
+    const VkDescriptorSetLayout layout,
+    const uint32_t push_constants_size) {
+  PipelineLayoutCache::Key key{layout, push_constants_size};
   std::lock_guard<std::mutex> lock(cache_mutex_);
 
   auto it = cache_.find(key);
   if (cache_.cend() == it) {
-    it = cache_.insert({key, PipelineLayoutCache::Value(device_, key)}).first;
+    it = cache_
+             .insert(
+                 {key,
+                  PipelineLayoutCache::Value(
+                      device_, layout, push_constants_size)})
+             .first;
   }
 
   return it->second.handle();
diff --git a/backends/vulkan/runtime/vk_api/Pipeline.h b/backends/vulkan/runtime/vk_api/Pipeline.h
index 4f42a9bf6bb..5460a0acba7 100644
--- a/backends/vulkan/runtime/vk_api/Pipeline.h
+++ b/backends/vulkan/runtime/vk_api/Pipeline.h
@@ -121,7 +121,7 @@ VkImageLayout vk_layout(const PipelineStageFlags, const MemoryAccessFlags);
 
 class PipelineLayout final {
  public:
-  explicit PipelineLayout(VkDevice, VkDescriptorSetLayout);
+  explicit PipelineLayout(VkDevice, VkDescriptorSetLayout, const uint32_t);
 
   PipelineLayout(const PipelineLayout&) = delete;
   PipelineLayout& operator=(const PipelineLayout&) = delete;
@@ -193,13 +193,17 @@ class PipelineLayoutCache final {
   PipelineLayoutCache& operator=(PipelineLayoutCache&&) = delete;
 
   ~PipelineLayoutCache();
-
-  using Key = VkDescriptorSetLayout;
+  using Key = std::pair<VkDescriptorSetLayout, uint32_t>;
   using Value = PipelineLayout;
 
   struct Hasher {
-    inline size_t operator()(VkDescriptorSetLayout descriptor_layout) const {
-      return std::hash<VkDescriptorSetLayout>()(descriptor_layout);
+    inline size_t operator()(
+        std::pair<VkDescriptorSetLayout, uint32_t> key) const {
+      size_t seed = 0;
+      seed = utils::hash_combine(
+          seed, std::hash<VkDescriptorSetLayout>()(key.first));
+      seed = utils::hash_combine(seed, std::hash<uint32_t>()(key.second));
+      return seed;
     }
   };
 
@@ -212,7 +216,7 @@ class PipelineLayoutCache final {
   std::unordered_map<Key, Value, Hasher> cache_;
 
  public:
-  VkPipelineLayout retrieve(const Key&);
+  VkPipelineLayout retrieve(const VkDescriptorSetLayout, const uint32_t);
   void purge();
 };