 #include <ATen/native/vulkan/api/api.h>

-#include <ATen/native/vulkan/impl/Arithmetic.h>
-#include <ATen/native/vulkan/impl/Packing.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/OpUtils.h>

 #include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/StagingUtils.h>

 using namespace at::native::vulkan;

-//
-// Utilities
-//
-
 #define CREATE_FLOAT_TEXTURE(sizes, allocate_memory) \
   vTensor(                                           \
       api::context(),                                \
@@ -43,23 +38,159 @@ using namespace at::native::vulkan;
       api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, \
       allocate_memory);

+//
+// Simplified versions of ATen Vulkan legacy functions
+//
+
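+// Stand-in for the removed packing::record_nchw_to_buffer_op: dispatches the
+// buffer_to_buffer shader with one invocation per element to copy NCHW data
+// from a staging buffer into a buffer-backed vTensor.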
+void record_nchw_to_buffer_op(
+    api::Context* const context,
+    api::VulkanBuffer& src_buffer,
+    vTensor& v_dst) {
+  uint32_t buf_len = api::utils::safe_downcast<uint32_t>(v_dst.gpu_numel());
+  api::utils::uvec3 global_size = {buf_len, 1u, 1u};
+  api::utils::uvec3 local_size = {32u, 1u, 1u};
+
+  api::UniformParamsBuffer cpu_buffer_metadata(
+      context, v_dst.get_cpu_buffer_metadata());
+  api::PipelineBarrier pipeline_barrier{};
+
+  context->submit_compute_job(
+      VK_KERNEL(buffer_to_buffer),
+      pipeline_barrier,
+      global_size,
+      local_size,
+      VK_NULL_HANDLE,
+      v_dst.buffer(
+          pipeline_barrier,
+          api::PipelineStage::COMPUTE,
+          api::MemoryAccessType::WRITE),
+      v_dst.buffer_metadata(),
+      src_buffer,
+      cpu_buffer_metadata.buffer());
+}
+
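+// Inverse of record_nchw_to_buffer_op: copies a buffer-backed vTensor back
+// into a destination buffer in NCHW order, forwarding the return value of
+// submit_compute_job to the caller.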
+bool record_buffer_to_nchw_op(
+    api::Context* const context,
+    vTensor& v_src,
+    api::VulkanBuffer& dst_buffer) {
+  uint32_t buf_len = api::utils::safe_downcast<uint32_t>(v_src.numel());
+  api::utils::uvec3 global_size = {buf_len, 1u, 1u};
+  api::utils::uvec3 local_size = {4u, 1u, 1u};
+
+  api::UniformParamsBuffer cpu_buffer_metadata(
+      context, v_src.get_cpu_buffer_metadata());
+  api::PipelineBarrier pipeline_barrier{};
+
+  return context->submit_compute_job(
+      VK_KERNEL(buffer_to_buffer),
+      pipeline_barrier,
+      global_size,
+      local_size,
+      VK_NULL_HANDLE,
+      dst_buffer,
+      cpu_buffer_metadata.buffer(),
+      v_src.buffer(
+          pipeline_barrier,
+          api::PipelineStage::COMPUTE,
+          api::MemoryAccessType::WRITE),
+      v_src.buffer_metadata());
+}
+
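+// Packs NCHW data from a staging buffer into a texture-backed vTensor, using
+// the shader that get_nchw_to_image_shader selects for the tensor and one
+// invocation per output texel.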
+void record_nchw_to_image_op(
+    api::Context* const context,
+    api::VulkanBuffer& src_buffer,
+    vTensor& v_dst) {
+  api::utils::uvec3 global_size = v_dst.extents();
+  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
+
+  api::UniformParamsBuffer params(context, create_staging_params(v_dst));
+  api::PipelineBarrier pipeline_barrier{};
+
+  context->submit_compute_job(
+      get_nchw_to_image_shader(v_dst),
+      pipeline_barrier,
+      global_size,
+      local_size,
+      VK_NULL_HANDLE,
+      v_dst.image(
+          pipeline_barrier,
+          api::PipelineStage::COMPUTE,
+          api::MemoryAccessType::WRITE),
+      src_buffer,
+      params.buffer());
+}
+
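+// Unpacks a texture-backed vTensor into a destination buffer in NCHW order,
+// mirroring record_nchw_to_image_op; forwards submit_compute_job's result.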
+bool record_image_to_nchw_op(
+    api::Context* const context,
+    vTensor& v_src,
+    api::VulkanBuffer& dst_buffer) {
+  api::utils::uvec3 global_size = v_src.extents();
+  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
+
+  api::UniformParamsBuffer params(context, create_staging_params(v_src));
+  api::PipelineBarrier pipeline_barrier{};
+
+  return context->submit_compute_job(
+      get_image_to_nchw_shader(v_src),
+      pipeline_barrier,
+      global_size,
+      local_size,
+      VK_NULL_HANDLE,
+      v_src.image(
+          pipeline_barrier,
+          api::PipelineStage::COMPUTE,
+          api::MemoryAccessType::WRITE),
+      dst_buffer,
+      params.buffer());
+}
+
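+// Replaces arithmetic::record_op: binds two input images, an output image,
+// and an ArithmeticParams uniform block (input/output sizes plus alpha,
+// assumed to come from the OpUtils.h include added above), then dispatches
+// the given binary-op shader over the output extents.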
+void record_arithmetic_op(
+    api::Context* const context,
+    const api::ShaderInfo& compute_shader,
+    vTensor& v_in1,
+    vTensor& v_in2,
+    vTensor& v_dst,
+    const float alpha) {
+  api::utils::uvec3 global_size = v_dst.extents();
+  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
+
+  ArithmeticParams block{
+      get_size_as_ivec4(v_dst),
+      get_size_as_ivec4(v_in1),
+      get_size_as_ivec4(v_in2),
+      alpha,
+  };
+  api::UniformParamsBuffer params(context, block);
+  api::PipelineBarrier pipeline_barrier{};
+
+  context->submit_compute_job(
+      compute_shader,
+      pipeline_barrier,
+      global_size,
+      local_size,
+      VK_NULL_HANDLE,
+      v_dst.image(
+          pipeline_barrier,
+          api::PipelineStage::COMPUTE,
+          api::MemoryAccessType::WRITE),
+      v_in1.image(pipeline_barrier, api::PipelineStage::COMPUTE),
+      v_in2.image(pipeline_barrier, api::PipelineStage::COMPUTE),
+      params.buffer());
+}
+
+//
+// Utilities
+//
+
 void fill_vtensor(vTensor& vten, std::vector<float>& data) {
   api::StorageBuffer staging_buffer(api::context(), api::kFloat, data.size());

   copy_ptr_to_staging(data.data(), staging_buffer, vten.gpu_nbytes());

   if (vten.storage_type() == api::StorageType::BUFFER) {
-    packing::record_nchw_to_buffer_op(
-        api::context(), staging_buffer.buffer(), vten, {}, VK_NULL_HANDLE);
+    record_nchw_to_buffer_op(api::context(), staging_buffer.buffer(), vten);
   } else {
-    api::ShaderInfo compute_shader = packing::get_nchw_to_image_shader(vten);
-    packing::record_nchw_to_image_op(
-        api::context(),
-        compute_shader,
-        staging_buffer.buffer(),
-        vten,
-        {},
-        VK_NULL_HANDLE);
+    record_nchw_to_image_op(api::context(), staging_buffer.buffer(), vten);
   }
 }
@@ -75,17 +206,9 @@ void extract_vtensor(vTensor& vten, std::vector<float>& data) {
       api::context(), api::kFloat, vten.gpu_numel());

   if (vten.storage_type() == api::StorageType::BUFFER) {
-    packing::record_buffer_to_nchw_op(
-        api::context(), vten, staging_buffer.buffer(), {}, VK_NULL_HANDLE);
+    record_buffer_to_nchw_op(api::context(), vten, staging_buffer.buffer());
   } else {
-    api::ShaderInfo compute_shader = packing::get_image_to_nchw_shader(vten);
-    packing::record_image_to_nchw_op(
-        api::context(),
-        compute_shader,
-        vten,
-        staging_buffer.buffer(),
-        {},
-        VK_NULL_HANDLE);
+    record_image_to_nchw_op(api::context(), vten, staging_buffer.buffer());
   }

   api::VulkanFence fence = api::context()->fences().get_fence();
@@ -208,14 +331,14 @@ TEST_F(VulkanComputeAPITest, texture_add_sanity_check) {
   std::fill(data_b.begin(), data_b.end(), 1.5f);

   // Add shader kernel
-  api::ShaderInfo kernel = arithmetic::get_shader(arithmetic::OpType::ADD);
+  api::ShaderInfo kernel = VK_KERNEL(add);

   // Fill input tensors
   fill_vtensor(a, data_a);
   fill_vtensor(b, data_b);

   // a + b -> c
-  arithmetic::record_op(api::context(), kernel, a, b, c, 1.0f);
+  record_arithmetic_op(api::context(), kernel, a, b, c, 1.0f);

   // Extract output tensor
   std::vector<float> data_out(c.gpu_numel());
@@ -244,7 +367,7 @@ TEST_F(VulkanComputeAPITest, texture_deferred_allocation_test) {
   std::vector<float> data_b(b.gpu_numel());
   std::fill(data_b.begin(), data_b.end(), 1.5f);

-  api::ShaderInfo kernel = arithmetic::get_shader(arithmetic::OpType::ADD);
+  api::ShaderInfo kernel = VK_KERNEL(add);

   // Allocate memory at the last possible opportunity
   api::MemoryAllocation a_mem = allocate_memory_for(a);
@@ -260,7 +383,7 @@ TEST_F(VulkanComputeAPITest, texture_deferred_allocation_test) {
   fill_vtensor(a, data_a);
   fill_vtensor(b, data_b);

-  arithmetic::record_op(api::context(), kernel, a, b, c, 1.0f);
+  record_arithmetic_op(api::context(), kernel, a, b, c, 1.0f);

   std::vector<float> data_c(c.gpu_numel());
   extract_vtensor(c, data_c);
@@ -310,20 +433,20 @@ TEST_F(VulkanComputeAPITest, texture_resource_aliasing_test) {
   std::fill(data_d.begin(), data_d.end(), 1.0f);

   // Get shader kernel for add
-  api::ShaderInfo kernel = arithmetic::get_shader(arithmetic::OpType::ADD);
+  api::ShaderInfo kernel = VK_KERNEL(add);

   // First, fill a and b with data
   fill_vtensor(a, data_a);
   fill_vtensor(b, data_b);

   // a + b -> c
-  arithmetic::record_op(api::context(), kernel, a, b, c, 1.0f);
+  record_arithmetic_op(api::context(), kernel, a, b, c, 1.0f);

   // Now d can be filled with data
   fill_vtensor(d, data_d);

   // c + d -> e
-  arithmetic::record_op(api::context(), kernel, c, d, e, 1.0f);
+  record_arithmetic_op(api::context(), kernel, c, d, e, 1.0f);

   // Extract data from e
   std::vector<float> data_e(e.gpu_numel());