diff --git a/.gitmodules b/.gitmodules
index e9d6d0020..cf89bdae6 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -47,6 +47,3 @@
 [submodule "external/SPIRV-Tools"]
 	path = external/SPIRV-Tools
 	url = https://github.com/KhronosGroup/SPIRV-Tools
-[submodule "external/SPIRV-Cross"]
-	path = external/SPIRV-Cross
-	url = https://github.com/KhronosGroup/SPIRV-Cross
diff --git a/cmake/TaichiCore.cmake b/cmake/TaichiCore.cmake
index db4f6bf3b..3d6e6d2a0 100644
--- a/cmake/TaichiCore.cmake
+++ b/cmake/TaichiCore.cmake
@@ -92,12 +92,7 @@ file(GLOB TAICHI_OPENGL_REQUIRED_SOURCE
   "taichi/backends/opengl/codegen_opengl.*"
   "taichi/backends/opengl/struct_opengl.*"
 )
-file(GLOB TAICHI_VULKAN_REQUIRED_SOURCE
-  "taichi/backends/vulkan/runtime.h"
-  "taichi/backends/vulkan/runtime.cpp"
-  "taichi/backends/vulkan/snode_struct_compiler.cpp"
-  "taichi/backends/vulkan/snode_struct_compiler.h"
-)
+file(GLOB TAICHI_VULKAN_REQUIRED_SOURCE "taichi/backends/vulkan/runtime.h" "taichi/backends/vulkan/runtime.cpp")
 
 list(REMOVE_ITEM TAICHI_CORE_SOURCE ${TAICHI_BACKEND_SOURCE})
 
@@ -261,10 +256,6 @@ else()
     message(STATUS "TI_WITH_CUDA_TOOLKIT = OFF")
 endif()
 
-add_subdirectory(external/SPIRV-Cross)
-target_include_directories(${CORE_LIBRARY_NAME} PRIVATE external/SPIRV-Cross)
-target_link_libraries(${CORE_LIBRARY_NAME} spirv-cross-glsl spirv-cross-core)
-
 if (TI_WITH_VULKAN)
     # Vulkan libs
     # https://cmake.org/cmake/help/latest/module/FindVulkan.html
diff --git a/examples/minimal.py b/examples/minimal.py
index ce7c87a32..81e1d6c30 100644
--- a/examples/minimal.py
+++ b/examples/minimal.py
@@ -4,9 +4,8 @@
 
 
 @ti.kernel
-def p() -> ti.f32:
+def p():
     print(42)
-    return 40 + 2
 
 
-print(p())
+p()
diff --git a/external/SPIRV-Cross b/external/SPIRV-Cross
deleted file mode 160000
index 97a438d21..000000000
--- a/external/SPIRV-Cross
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 97a438d214b24e4958ca137a18639670648cedd0
diff --git a/taichi/backends/device.h b/taichi/backends/device.h
index f9e5ff59f..e66697f94 100644
--- a/taichi/backends/device.h
+++ b/taichi/backends/device.h
@@ -12,27 +12,25 @@ namespace lang {
 // Or the backend runtime itself
 // Capabilities are per-device
 enum class DeviceCapability : uint32_t {
-  // Vulkan Caps
   vk_api_version,
+  vk_spirv_version,
   vk_has_physical_features2,
+  vk_has_int8,
+  vk_has_int16,
+  vk_has_int64,
+  vk_has_float16,
+  vk_has_float64,
   vk_has_external_memory,
+  vk_has_atomic_i64,
+  vk_has_atomic_float,  // load, store, exchange
+  vk_has_atomic_float_add,
+  vk_has_atomic_float_minmax,
+  vk_has_atomic_float64,  // load, store, exchange
+  vk_has_atomic_float64_add,
+  vk_has_atomic_float64_minmax,
   vk_has_surface,
   vk_has_presentation,
-  // SPIR-V Caps
-  spirv_version,
-  spirv_has_int8,
-  spirv_has_int16,
-  spirv_has_int64,
-  spirv_has_float16,
-  spirv_has_float64,
-  spirv_has_atomic_i64,
-  spirv_has_atomic_float,  // load, store, exchange
-  spirv_has_atomic_float_add,
-  spirv_has_atomic_float_minmax,
-  spirv_has_atomic_float64,  // load, store, exchange
-  spirv_has_atomic_float64_add,
-  spirv_has_atomic_float64_minmax,
-  spirv_has_variable_ptr,
+  vk_has_spv_variable_ptr,
 };
 
 class Device;
diff --git a/taichi/backends/vulkan/codegen_vulkan.cpp b/taichi/backends/vulkan/codegen_vulkan.cpp
index 8eb6dcf55..44d7d2136 100644
--- a/taichi/backends/vulkan/codegen_vulkan.cpp
+++ b/taichi/backends/vulkan/codegen_vulkan.cpp
@@ -825,7 +825,7 @@ class TaskCodegen : public IRVisitor {
     spirv::Value data = ir_->query_value(stmt->val->raw_name());
     spirv::Value val;
     if (dt->is_primitive(PrimitiveTypeID::f32)) {
-      if (device_->get_cap(DeviceCapability::spirv_has_atomic_float_add) &&
+      if (device_->get_cap(DeviceCapability::vk_has_atomic_float_add) &&
           stmt->op_type == AtomicOpType::add && is_compiled_struct) {
         val = ir_->make_value(
             spv::OpAtomicFAddEXT, ir_->get_primitive_type(dt), addr_ptr,
@@ -1024,7 +1024,7 @@ class TaskCodegen : public IRVisitor {
         task_attribs_.advisory_num_threads_per_group, 1, 1};
     ir_->set_work_group_size(group_size);
     std::vector<spirv::Value> buffers;
-    if (device_->get_cap(DeviceCapability::spirv_version) > 0x10300) {
+    if (device_->get_cap(DeviceCapability::vk_spirv_version) > 0x10300) {
       for (const auto &bb : task_attribs_.buffer_binds) {
         const auto it = buffer_value_map_.find(bb.buffer);
         if (it != buffer_value_map_.end()) {
@@ -1140,23 +1140,12 @@ class TaskCodegen : public IRVisitor {
     // For now, |total_invocs_name| is equal to |total_elems|. Once we support
     // dynamic range, they will be different.
     // https://www.khronos.org/opengl/wiki/Compute_Shader#Inputs
-
-    // HLSL & WGSL cross compilers do not support this builtin
-    /*
     spirv::Value total_invocs = ir_->cast(
         ir_->i32_type(),
         ir_->mul(ir_->get_num_work_groups(0),
                  ir_->uint_immediate_number(
                      ir_->u32_type(),
                      task_attribs_.advisory_num_threads_per_group, true)));
-                     */
-    const int group_x = (task_attribs_.advisory_total_num_threads +
-                         task_attribs_.advisory_num_threads_per_group - 1) /
-                        task_attribs_.advisory_num_threads_per_group;
-    spirv::Value total_invocs = ir_->uint_immediate_number(
-        ir_->i32_type(), group_x * task_attribs_.advisory_num_threads_per_group,
-        false);
-
     ir_->debug(spv::OpName, total_invocs, total_invocs_name);
 
     // Must get init label after making value(to make sure they are correct)
@@ -1390,15 +1379,12 @@ class KernelCodegen {
 
       // Enable to dump SPIR-V assembly of kernels
 #if 0
-      std::string spirv_asm;
-      spirv_tools_->Disassemble(optimized_spv, &spirv_asm);
-      TI_WARN("SPIR-V Assembly dump for {} :\n{}\n\n", params_.ti_kernel_name,
-              spirv_asm);
-
-      std::ofstream fout((params_.ti_kernel_name).c_str(),
-                         std::ios::binary | std::ios::out);
-      fout.write(reinterpret_cast<const char *>(task_res.spirv_code.data()),
-                 task_res.spirv_code.size() * sizeof(uint32_t));
+       std::string spirv_asm;
+       spirv_tools_->Disassemble(optimized_spv, &spirv_asm);
+      TI_WARN("SPIR-V Assembly dump for {} :\n{}\n\n",params_.ti_kernel_name, spirv_asm);
+
+      std::ofstream fout((params_.ti_kernel_name).c_str(), std::ios::binary | std::ios::out);
+      fout.write(reinterpret_cast<const char*>(task_res.spirv_code.data()), task_res.spirv_code.size() * sizeof(uint32_t));
       fout.close();
 #endif
 
diff --git a/taichi/backends/vulkan/embedded_device.cpp b/taichi/backends/vulkan/embedded_device.cpp
index 689ce45b8..e418581fb 100644
--- a/taichi/backends/vulkan/embedded_device.cpp
+++ b/taichi/backends/vulkan/embedded_device.cpp
@@ -187,6 +187,8 @@ EmbeddedVulkanDevice::EmbeddedVulkanDevice(
   pick_physical_device();
   create_logical_device();
 
+  // TODO: Change the ownership hierarchy, the taichi Device class should be at
+  // the top level
   {
     VulkanDevice::Params params;
     params.instance = instance_;
@@ -215,6 +217,10 @@ EmbeddedVulkanDevice::~EmbeddedVulkanDevice() {
   vkDestroyInstance(instance_, kNoVkAllocCallbacks);
 }
 
+Device *EmbeddedVulkanDevice::get_ti_device() const {
+  return ti_device_.get();
+}
+
 void EmbeddedVulkanDevice::create_instance() {
   VkApplicationInfo app_info{};
   app_info.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO;
@@ -375,10 +381,10 @@ void EmbeddedVulkanDevice::create_logical_device() {
   vkGetPhysicalDeviceProperties(physical_device_, &physical_device_properties);
   ti_device_->set_cap(DeviceCapability::vk_api_version,
                       physical_device_properties.apiVersion);
-  ti_device_->set_cap(DeviceCapability::spirv_version, 0x10000);
+  ti_device_->set_cap(DeviceCapability::vk_spirv_version, 0x10000);
 
   if (physical_device_properties.apiVersion >= VK_API_VERSION_1_1) {
-    ti_device_->set_cap(DeviceCapability::spirv_version, 0x10300);
+    ti_device_->set_cap(DeviceCapability::vk_spirv_version, 0x10300);
   }
 
   // Detect extensions
@@ -421,7 +427,7 @@ void EmbeddedVulkanDevice::create_logical_device() {
     } else if (name == VK_KHR_SYNCHRONIZATION_2_EXTENSION_NAME) {
       enabled_extensions.push_back(ext.extensionName);
     } else if (name == VK_KHR_SPIRV_1_4_EXTENSION_NAME) {
-      ti_device_->set_cap(DeviceCapability::spirv_version, 0x10400);
+      ti_device_->set_cap(DeviceCapability::vk_spirv_version, 0x10400);
       enabled_extensions.push_back(ext.extensionName);
     } else if (name == VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME) {
       ti_device_->set_cap(DeviceCapability::vk_has_external_memory, true);
@@ -448,15 +454,15 @@ void EmbeddedVulkanDevice::create_logical_device() {
 
   if (device_supported_features.shaderInt16) {
     device_features.shaderInt16 = true;
-    ti_device_->set_cap(DeviceCapability::spirv_has_int16, true);
+    ti_device_->set_cap(DeviceCapability::vk_has_int16, true);
   }
   if (device_supported_features.shaderInt64) {
     device_features.shaderInt64 = true;
-    ti_device_->set_cap(DeviceCapability::spirv_has_int64, true);
+    ti_device_->set_cap(DeviceCapability::vk_has_int64, true);
   }
   if (device_supported_features.shaderFloat64) {
     device_features.shaderFloat64 = true;
-    ti_device_->set_cap(DeviceCapability::spirv_has_float64, true);
+    ti_device_->set_cap(DeviceCapability::vk_has_float64, true);
   }
   if (device_supported_features.wideLines) {
     device_features.wideLines = true;
@@ -493,7 +499,7 @@ void EmbeddedVulkanDevice::create_logical_device() {
 
       if (variable_ptr_feature.variablePointers &&
           variable_ptr_feature.variablePointersStorageBuffer) {
-        ti_device_->set_cap(DeviceCapability::spirv_has_variable_ptr, true);
+        ti_device_->set_cap(DeviceCapability::vk_has_spv_variable_ptr, true);
       }
       *pNextEnd = &variable_ptr_feature;
       pNextEnd = &variable_ptr_feature.pNext;
@@ -505,14 +511,13 @@ void EmbeddedVulkanDevice::create_logical_device() {
       vkGetPhysicalDeviceFeatures2KHR(physical_device_, &features2);
 
       if (shader_atomic_float_feature.shaderBufferFloat32AtomicAdd) {
-        ti_device_->set_cap(DeviceCapability::spirv_has_atomic_float_add, true);
+        ti_device_->set_cap(DeviceCapability::vk_has_atomic_float_add, true);
       } else if (shader_atomic_float_feature.shaderBufferFloat64AtomicAdd) {
-        ti_device_->set_cap(DeviceCapability::spirv_has_atomic_float64_add,
-                            true);
+        ti_device_->set_cap(DeviceCapability::vk_has_atomic_float64_add, true);
       } else if (shader_atomic_float_feature.shaderBufferFloat32Atomics) {
-        ti_device_->set_cap(DeviceCapability::spirv_has_atomic_float, true);
+        ti_device_->set_cap(DeviceCapability::vk_has_atomic_float, true);
       } else if (shader_atomic_float_feature.shaderBufferFloat64Atomics) {
-        ti_device_->set_cap(DeviceCapability::spirv_has_atomic_float64, true);
+        ti_device_->set_cap(DeviceCapability::vk_has_atomic_float64, true);
       }
       *pNextEnd = &shader_atomic_float_feature;
       pNextEnd = &shader_atomic_float_feature.pNext;
@@ -524,9 +529,9 @@ void EmbeddedVulkanDevice::create_logical_device() {
       vkGetPhysicalDeviceFeatures2KHR(physical_device_, &features2);
 
       if (shader_f16_i8_feature.shaderFloat16) {
-        ti_device_->set_cap(DeviceCapability::spirv_has_float16, true);
+        ti_device_->set_cap(DeviceCapability::vk_has_float16, true);
       } else if (shader_f16_i8_feature.shaderInt8) {
-        ti_device_->set_cap(DeviceCapability::spirv_has_int8, true);
+        ti_device_->set_cap(DeviceCapability::vk_has_int8, true);
       }
       *pNextEnd = &shader_f16_i8_feature;
       pNextEnd = &shader_f16_i8_feature.pNext;
@@ -549,6 +554,8 @@ void EmbeddedVulkanDevice::create_logical_device() {
   if (params_.is_for_ui) {
     vkGetDeviceQueue(device_, queue_family_indices_.graphics_family.value(), 0,
                      &graphics_queue_);
+    vkGetDeviceQueue(device_, queue_family_indices_.graphics_family.value(), 0,
+                     &present_queue_);
   }
 
   vkGetDeviceQueue(device_, queue_family_indices_.compute_family.value(), 0,
diff --git a/taichi/backends/vulkan/embedded_device.h b/taichi/backends/vulkan/embedded_device.h
index 1be3f7e11..4b70bac35 100644
--- a/taichi/backends/vulkan/embedded_device.h
+++ b/taichi/backends/vulkan/embedded_device.h
@@ -61,14 +61,36 @@ class EmbeddedVulkanDevice {
   explicit EmbeddedVulkanDevice(const Params &params);
   ~EmbeddedVulkanDevice();
 
-  const VulkanDevice *device() const {
-    return ti_device_.get();
+  VkInstance instance() {
+    return instance_;
   }
 
   VulkanDevice *device() {
     return ti_device_.get();
   }
 
+  const VulkanDevice *device() const {
+    return ti_device_.get();
+  }
+
+  VkPhysicalDevice physical_device() const {
+    return physical_device_;
+  }
+
+  VkSurfaceKHR surface() const {
+    return surface_;
+  }
+
+  VkInstance instance() const {
+    return instance_;
+  }
+
+  const VulkanQueueFamilyIndices &queue_family_indices() const {
+    return queue_family_indices_;
+  }
+
+  Device *get_ti_device() const;
+
  private:
   void create_instance();
   void setup_debug_messenger();
@@ -81,12 +103,19 @@ class EmbeddedVulkanDevice {
   VkPhysicalDevice physical_device_{VK_NULL_HANDLE};
   VulkanQueueFamilyIndices queue_family_indices_;
   VkDevice device_{VK_NULL_HANDLE};
-
+  // TODO: It's probably not right to put these per-queue things here. However,
+  // in Taichi we only use a single queue on a single device (i.e. a single CUDA
+  // stream), so it doesn't make a difference.
   VkQueue compute_queue_{VK_NULL_HANDLE};
   VkQueue graphics_queue_{VK_NULL_HANDLE};
+  VkQueue present_queue_{VK_NULL_HANDLE};
 
   VkSurfaceKHR surface_{VK_NULL_HANDLE};
 
+  // TODO: Shall we have dedicated command pools for COMPUTE and TRANSFER
+  // commands, respectively?
+  VkCommandPool command_pool_{VK_NULL_HANDLE};
+
   std::unique_ptr<VulkanDevice> ti_device_{nullptr};
 
   Params params_;
diff --git a/taichi/backends/vulkan/loader.cpp b/taichi/backends/vulkan/loader.cpp
index d71a5f069..2e4eaae3b 100644
--- a/taichi/backends/vulkan/loader.cpp
+++ b/taichi/backends/vulkan/loader.cpp
@@ -40,10 +40,6 @@ PFN_vkVoidFunction VulkanLoader::load_function(const char *name) {
   return result;
 }
 
-bool is_vulkan_api_available() {
-  return VulkanLoader::instance().init();
-}
-
 }  // namespace vulkan
 }  // namespace lang
 }  // namespace taichi
diff --git a/taichi/backends/vulkan/loader.h b/taichi/backends/vulkan/loader.h
index 220a7411e..4fbda6bb6 100644
--- a/taichi/backends/vulkan/loader.h
+++ b/taichi/backends/vulkan/loader.h
@@ -35,8 +35,6 @@ class VulkanLoader {
   VkDevice vulkan_device_{VK_NULL_HANDLE};
 };
 
-bool is_vulkan_api_available();
-
 }  // namespace vulkan
 }  // namespace lang
 }  // namespace taichi
diff --git a/taichi/backends/vulkan/runtime.cpp b/taichi/backends/vulkan/runtime.cpp
index b942c05e6..ae6d5d6ab 100644
--- a/taichi/backends/vulkan/runtime.cpp
+++ b/taichi/backends/vulkan/runtime.cpp
@@ -9,6 +9,18 @@
 #include <unordered_set>
 #include <vector>
 
+#include "taichi/util/environ_config.h"
+
+#ifdef TI_WITH_VULKAN
+#include "taichi/backends/vulkan/embedded_device.h"
+#include "taichi/backends/vulkan/vulkan_utils.h"
+#include "taichi/backends/vulkan/loader.h"
+
+#include "vk_mem_alloc.h"
+#include "taichi/backends/vulkan/vulkan_device.h"
+#endif  // TI_WITH_VULKAN
+
+#include "taichi/math/arithmetic.h"
 #define TI_RUNTIME_HOST
 #include "taichi/program/context.h"
 #undef TI_RUNTIME_HOST
@@ -17,6 +29,8 @@ namespace taichi {
 namespace lang {
 namespace vulkan {
 
+#ifdef TI_WITH_VULKAN
+
 namespace {
 class StopWatch {
  public:
@@ -38,6 +52,16 @@ class StopWatch {
   std::chrono::time_point<std::chrono::system_clock> begin_;
 };
 
+using BufferType = TaskAttributes::BufferType;
+using BufferInfo = TaskAttributes::BufferInfo;
+using BufferBind = TaskAttributes::BufferBind;
+using BufferInfoHasher = TaskAttributes::BufferInfoHasher;
+
+// TODO: In the future this isn't necessarily a pointer, since DeviceAllocation
+// is already a pretty cheap handle>
+using InputBuffersMap =
+    std::unordered_map<BufferInfo, DeviceAllocation *, BufferInfoHasher>;
+
 class HostDeviceContextBlitter {
  public:
   HostDeviceContextBlitter(const KernelContextAttributes *ctx_attribs,
@@ -79,22 +103,22 @@ class HostDeviceContextBlitter {
           std::memcpy(device_ptr, host_ptr, arg.stride);
           break;
         }
-        if (device_->get_cap(DeviceCapability::spirv_has_int8)) {
+        if (device_->get_cap(DeviceCapability::vk_has_int8)) {
           TO_DEVICE(i8, int8)
           TO_DEVICE(u8, uint8)
         }
-        if (device_->get_cap(DeviceCapability::spirv_has_int16)) {
+        if (device_->get_cap(DeviceCapability::vk_has_int16)) {
           TO_DEVICE(i16, int16)
           TO_DEVICE(u16, uint16)
         }
         TO_DEVICE(i32, int32)
         TO_DEVICE(u32, uint32)
         TO_DEVICE(f32, float32)
-        if (device_->get_cap(DeviceCapability::spirv_has_int64)) {
+        if (device_->get_cap(DeviceCapability::vk_has_int64)) {
           TO_DEVICE(i64, int64)
           TO_DEVICE(u64, uint64)
         }
-        if (device_->get_cap(DeviceCapability::spirv_has_float64)) {
+        if (device_->get_cap(DeviceCapability::vk_has_float64)) {
           TO_DEVICE(f64, float64)
         }
         TI_ERROR("Vulkan does not support arg type={}", data_type_name(arg.dt));
@@ -161,22 +185,22 @@ class HostDeviceContextBlitter {
           std::memcpy(host_ptr, device_ptr, ret.stride);
           break;
         }
-        if (device_->get_cap(DeviceCapability::spirv_has_int8)) {
+        if (device_->get_cap(DeviceCapability::vk_has_int8)) {
           TO_HOST(i8, int8)
           TO_HOST(u8, uint8)
         }
-        if (device_->get_cap(DeviceCapability::spirv_has_int16)) {
+        if (device_->get_cap(DeviceCapability::vk_has_int16)) {
           TO_HOST(i16, int16)
           TO_HOST(u16, uint16)
         }
         TO_HOST(i32, int32)
         TO_HOST(u32, uint32)
         TO_HOST(f32, float32)
-        if (device_->get_cap(DeviceCapability::spirv_has_int64)) {
+        if (device_->get_cap(DeviceCapability::vk_has_int64)) {
           TO_HOST(i64, int64)
           TO_HOST(u64, uint64)
         }
-        if (device_->get_cap(DeviceCapability::spirv_has_float64)) {
+        if (device_->get_cap(DeviceCapability::vk_has_float64)) {
           TO_HOST(f64, float64)
         }
         TI_ERROR("Vulkan does not support return value type={}",
@@ -212,225 +236,358 @@ class HostDeviceContextBlitter {
   Device *const device_;
 };
 
-}  // namespace
-
 // Info for launching a compiled Taichi kernel, which consists of a series of
 // Vulkan pipelines.
+class CompiledTaichiKernel {
+ public:
+  struct Params {
+    const TaichiKernelAttributes *ti_kernel_attribs{nullptr};
+    std::vector<std::vector<uint32_t>> spirv_bins;
+    std::vector<CompiledSNodeStructs> compiled_structs;
+
+    VulkanDevice *device{nullptr};
+    std::vector<DeviceAllocation *> root_buffers;
+    DeviceAllocation *global_tmps_buffer{nullptr};
+  };
+
+  CompiledTaichiKernel(const Params &ti_params)
+      : ti_kernel_attribs_(*ti_params.ti_kernel_attribs),
+        device_(ti_params.device) {
+    input_buffers_[BufferType::GlobalTmps] = ti_params.global_tmps_buffer;
+    for (int root = 0; root < ti_params.compiled_structs.size(); ++root) {
+      BufferInfo buffer = {BufferType::Root, root};
+      input_buffers_[buffer] = ti_params.root_buffers[root];
+    }
+    const auto ctx_sz = ti_kernel_attribs_.ctx_attribs.total_bytes();
+    if (!ti_kernel_attribs_.ctx_attribs.empty()) {
+      Device::AllocParams params;
+      ctx_buffer_ = ti_params.device->allocate_memory_unique(
+          {size_t(ctx_sz),
+           /*host_write=*/true, /*host_read=*/false,
+           /*export_sharing=*/false, AllocUsage::Storage});
+      ctx_buffer_host_ = ti_params.device->allocate_memory_unique(
+          {size_t(ctx_sz),
+           /*host_write=*/false, /*host_read=*/true,
+           /*export_sharing=*/false, AllocUsage::Storage});
+      input_buffers_[BufferType::Context] = ctx_buffer_.get();
+    }
 
-CompiledTaichiKernel::CompiledTaichiKernel(const Params &ti_params)
-    : ti_kernel_attribs_(*ti_params.ti_kernel_attribs),
-      device_(ti_params.device) {
-  input_buffers_[BufferType::GlobalTmps] = ti_params.global_tmps_buffer;
-  for (int root = 0; root < ti_params.compiled_structs.size(); ++root) {
-    BufferInfo buffer = {BufferType::Root, root};
-    input_buffers_[buffer] = ti_params.root_buffers[root];
-  }
-  const auto ctx_sz = ti_kernel_attribs_.ctx_attribs.total_bytes();
-  if (!ti_kernel_attribs_.ctx_attribs.empty()) {
-    Device::AllocParams params;
-    ctx_buffer_ = ti_params.device->allocate_memory_unique(
-        {size_t(ctx_sz),
-         /*host_write=*/true, /*host_read=*/false,
-         /*export_sharing=*/false, AllocUsage::Storage});
-    ctx_buffer_host_ = ti_params.device->allocate_memory_unique(
-        {size_t(ctx_sz),
-         /*host_write=*/false, /*host_read=*/true,
-         /*export_sharing=*/false, AllocUsage::Storage});
-    input_buffers_[BufferType::Context] = ctx_buffer_.get();
+    const auto &task_attribs = ti_kernel_attribs_.tasks_attribs;
+    const auto &spirv_bins = ti_params.spirv_bins;
+    TI_ASSERT(task_attribs.size() == spirv_bins.size());
+
+    for (int i = 0; i < task_attribs.size(); ++i) {
+      PipelineSourceDesc source_desc{PipelineSourceType::spirv_binary,
+                                     (void *)spirv_bins[i].data(),
+                                     spirv_bins[i].size() * sizeof(uint32_t)};
+      auto vp = ti_params.device->create_pipeline(source_desc,
+                                                  ti_kernel_attribs_.name);
+      pipelines_.push_back(std::move(vp));
+    }
   }
 
-  const auto &task_attribs = ti_kernel_attribs_.tasks_attribs;
-  const auto &spirv_bins = ti_params.spirv_bins;
-  TI_ASSERT(task_attribs.size() == spirv_bins.size());
+  const TaichiKernelAttributes &ti_kernel_attribs() const {
+    return ti_kernel_attribs_;
+  }
 
-  for (int i = 0; i < task_attribs.size(); ++i) {
-    PipelineSourceDesc source_desc{PipelineSourceType::spirv_binary,
-                                   (void *)spirv_bins[i].data(),
-                                   spirv_bins[i].size() * sizeof(uint32_t)};
-    auto vp =
-        ti_params.device->create_pipeline(source_desc, ti_kernel_attribs_.name);
-    pipelines_.push_back(std::move(vp));
+  size_t num_pipelines() const {
+    return pipelines_.size();
   }
-}
 
-const TaichiKernelAttributes &CompiledTaichiKernel::ti_kernel_attribs() const {
-  return ti_kernel_attribs_;
-}
+  DeviceAllocation *ctx_buffer() const {
+    return ctx_buffer_.get();
+  }
 
-size_t CompiledTaichiKernel::num_pipelines() const {
-  return pipelines_.size();
-}
+  DeviceAllocation *ctx_buffer_host() const {
+    return ctx_buffer_host_.get();
+  }
 
-DeviceAllocation *CompiledTaichiKernel::ctx_buffer() const {
-  return ctx_buffer_.get();
-}
+  void command_list(CommandList *cmdlist) const {
+    const auto &task_attribs = ti_kernel_attribs_.tasks_attribs;
+
+    for (int i = 0; i < task_attribs.size(); ++i) {
+      const auto &attribs = task_attribs[i];
+      auto vp = pipelines_[i].get();
+      const int group_x = (attribs.advisory_total_num_threads +
+                           attribs.advisory_num_threads_per_group - 1) /
+                          attribs.advisory_num_threads_per_group;
+      ResourceBinder *binder = vp->resource_binder();
+      for (auto &bind : attribs.buffer_binds) {
+        binder->rw_buffer(0, bind.binding, *input_buffers_.at(bind.buffer));
+      }
 
-DeviceAllocation *CompiledTaichiKernel::ctx_buffer_host() const {
-  return ctx_buffer_host_.get();
-}
+      cmdlist->bind_pipeline(vp);
+      cmdlist->bind_resources(binder);
+      cmdlist->dispatch(group_x);
+      cmdlist->memory_barrier();
+    }
 
-void CompiledTaichiKernel::command_list(CommandList *cmdlist) const {
-  const auto &task_attribs = ti_kernel_attribs_.tasks_attribs;
-
-  for (int i = 0; i < task_attribs.size(); ++i) {
-    const auto &attribs = task_attribs[i];
-    auto vp = pipelines_[i].get();
-    const int group_x = (attribs.advisory_total_num_threads +
-                         attribs.advisory_num_threads_per_group - 1) /
-                        attribs.advisory_num_threads_per_group;
-    ResourceBinder *binder = vp->resource_binder();
-    for (auto &bind : attribs.buffer_binds) {
-      binder->rw_buffer(0, bind.binding, *input_buffers_.at(bind.buffer));
+    const auto ctx_sz = ti_kernel_attribs_.ctx_attribs.total_bytes();
+    if (!ti_kernel_attribs_.ctx_attribs.empty()) {
+      cmdlist->buffer_copy(ctx_buffer_host_->get_ptr(0),
+                           ctx_buffer_->get_ptr(0), ctx_sz);
+      cmdlist->buffer_barrier(*ctx_buffer_host_);
     }
+  }
+
+ private:
+  TaichiKernelAttributes ti_kernel_attribs_;
+  std::vector<TaskAttributes> tasks_attribs_;
+
+  Device *device_;
+
+  InputBuffersMap input_buffers_;
 
-    cmdlist->bind_pipeline(vp);
-    cmdlist->bind_resources(binder);
-    cmdlist->dispatch(group_x);
-    cmdlist->memory_barrier();
+  // Right now |ctx_buffer_| is allocated from a HOST_VISIBLE|COHERENT
+  // memory, because we do not do computation on this buffer anyway, and it may
+  // not worth the effort doing another hop via a staging buffer.
+  // TODO: Provide an option to use staging buffer. This could be useful if the
+  // kernel does lots of IO on the context buffer, e.g., copy a large np array.
+  std::unique_ptr<DeviceAllocationGuard> ctx_buffer_{nullptr};
+  std::unique_ptr<DeviceAllocationGuard> ctx_buffer_host_{nullptr};
+  std::vector<std::unique_ptr<Pipeline>> pipelines_;
+};
+
+}  // namespace
+
+class VkRuntime ::Impl {
+ public:
+  explicit Impl(const Params &params)
+      : host_result_buffer_(params.host_result_buffer) {
+    TI_ASSERT(host_result_buffer_ != nullptr);
+    EmbeddedVulkanDevice::Params evd_params;
+    evd_params.api_version = VulkanEnvSettings::kApiVersion();
+    embedded_device_ = std::make_unique<EmbeddedVulkanDevice>(evd_params);
+    device_ = embedded_device_->get_ti_device();
+
+    init_buffers();
   }
 
-  const auto ctx_sz = ti_kernel_attribs_.ctx_attribs.total_bytes();
-  if (!ti_kernel_attribs_.ctx_attribs.empty()) {
-    cmdlist->buffer_copy(ctx_buffer_host_->get_ptr(0), ctx_buffer_->get_ptr(0),
-                         ctx_sz);
-    cmdlist->buffer_barrier(*ctx_buffer_host_);
+  ~Impl() {
+    {
+      decltype(ti_kernels_) tmp;
+      tmp.swap(ti_kernels_);
+    }
+    global_tmps_buffer_.reset();
   }
-}
 
-VkRuntime::VkRuntime(const Params &params)
-    : host_result_buffer_(params.host_result_buffer), device_(params.device) {
-  TI_ASSERT(host_result_buffer_ != nullptr);
-  init_buffers();
-}
+  void materialize_snode_tree(SNodeTree *tree) {
+    auto *const root = tree->root();
+    CompiledSNodeStructs compiled_structs =
+        vulkan::compile_snode_structs(*root);
+    add_root_buffer(compiled_structs.root_size);
+    compiled_snode_structs_.push_back(compiled_structs);
+  }
 
-VkRuntime::~VkRuntime() {
-  {
-    decltype(ti_kernels_) tmp;
-    tmp.swap(ti_kernels_);
+  void destroy_snode_tree(SNodeTree *snode_tree) {
+    int root_id = -1;
+    for (int i = 0; i < compiled_snode_structs_.size(); ++i) {
+      if (compiled_snode_structs_[i].root == snode_tree->root()) {
+        root_id = i;
+      }
+    }
+    if (root_id == -1) {
+      TI_ERROR("the tree to be destroyed cannot be found");
+    }
+    root_buffers_[root_id].reset();
   }
-  global_tmps_buffer_.reset();
-}
 
-void VkRuntime::materialize_snode_tree(SNodeTree *tree) {
-  auto *const root = tree->root();
-  CompiledSNodeStructs compiled_structs = vulkan::compile_snode_structs(*root);
-  add_root_buffer(compiled_structs.root_size);
-  compiled_snode_structs_.push_back(compiled_structs);
-}
+  const std::vector<CompiledSNodeStructs> &get_compiled_structs() const {
+    return compiled_snode_structs_;
+  }
 
-void VkRuntime::destroy_snode_tree(SNodeTree *snode_tree) {
-  int root_id = -1;
-  for (int i = 0; i < compiled_snode_structs_.size(); ++i) {
-    if (compiled_snode_structs_[i].root == snode_tree->root()) {
-      root_id = i;
+  KernelHandle register_taichi_kernel(RegisterParams reg_params) {
+    CompiledTaichiKernel::Params params;
+    params.ti_kernel_attribs = &(reg_params.kernel_attribs);
+    params.compiled_structs = get_compiled_structs();
+    params.device = embedded_device_->device();
+    params.root_buffers = {};
+    for (int root = 0; root < root_buffers_.size(); ++root) {
+      params.root_buffers.push_back(root_buffers_[root].get());
+    }
+    params.global_tmps_buffer = global_tmps_buffer_.get();
+
+    for (int i = 0; i < reg_params.task_spirv_source_codes.size(); ++i) {
+      const auto &attribs = reg_params.kernel_attribs.tasks_attribs[i];
+      const auto &spirv_src = reg_params.task_spirv_source_codes[i];
+      const auto &task_name = attribs.name;
+
+      // If we can reach here, we have succeeded. Otherwise
+      // std::optional::value() would have killed us.
+      params.spirv_bins.push_back(std::move(spirv_src));
+    }
+    KernelHandle res;
+    res.id_ = ti_kernels_.size();
+    ti_kernels_.push_back(std::make_unique<CompiledTaichiKernel>(params));
+    return res;
+  }
+
+  void launch_kernel(KernelHandle handle, Context *host_ctx) {
+    auto *ti_kernel = ti_kernels_[handle.id_].get();
+    auto ctx_blitter = HostDeviceContextBlitter::maybe_make(
+        &ti_kernel->ti_kernel_attribs().ctx_attribs, host_ctx, device_,
+        host_result_buffer_, ti_kernel->ctx_buffer(),
+        ti_kernel->ctx_buffer_host());
+    if (ctx_blitter) {
+      TI_ASSERT(ti_kernel->ctx_buffer() != nullptr);
+      ctx_blitter->host_to_device();
+    }
+
+    if (!current_cmdlist_) {
+      current_cmdlist_ = device_->get_compute_stream()->new_command_list();
+    }
+
+    ti_kernel->command_list(current_cmdlist_.get());
+
+    if (ctx_blitter) {
+      device_->get_compute_stream()->submit(current_cmdlist_.get());
+      ctx_blitter->device_to_host();
+
+      current_cmdlist_ = nullptr;
     }
   }
-  if (root_id == -1) {
-    TI_ERROR("the tree to be destroyed cannot be found");
+
+  void synchronize() {
+    device_->get_compute_stream()->command_sync();
   }
-  root_buffers_[root_id].reset();
-}
 
-const std::vector<CompiledSNodeStructs> &VkRuntime::get_compiled_structs()
-    const {
-  return compiled_snode_structs_;
-}
+  Device *get_ti_device() const {
+    return device_;
+  }
 
-VkRuntime::KernelHandle VkRuntime::register_taichi_kernel(
-    VkRuntime::RegisterParams reg_params) {
-  CompiledTaichiKernel::Params params;
-  params.ti_kernel_attribs = &(reg_params.kernel_attribs);
-  params.compiled_structs = get_compiled_structs();
-  params.device = device_;
-  params.root_buffers = {};
-  for (int root = 0; root < root_buffers_.size(); ++root) {
-    params.root_buffers.push_back(root_buffers_[root].get());
-  }
-  params.global_tmps_buffer = global_tmps_buffer_.get();
-
-  for (int i = 0; i < reg_params.task_spirv_source_codes.size(); ++i) {
-    const auto &attribs = reg_params.kernel_attribs.tasks_attribs[i];
-    const auto &spirv_src = reg_params.task_spirv_source_codes[i];
-    const auto &task_name = attribs.name;
-
-    // If we can reach here, we have succeeded. Otherwise
-    // std::optional::value() would have killed us.
-    params.spirv_bins.push_back(std::move(spirv_src));
-  }
-  KernelHandle res;
-  res.id_ = ti_kernels_.size();
-  ti_kernels_.push_back(std::make_unique<CompiledTaichiKernel>(params));
-  return res;
-}
+ private:
+  void init_buffers() {
+    size_t gtmp_buffer_size = 1024 * 1024;
 
-void VkRuntime::launch_kernel(KernelHandle handle, Context *host_ctx) {
-  auto *ti_kernel = ti_kernels_[handle.id_].get();
-  auto ctx_blitter = HostDeviceContextBlitter::maybe_make(
-      &ti_kernel->ti_kernel_attribs().ctx_attribs, host_ctx, device_,
-      host_result_buffer_, ti_kernel->ctx_buffer(),
-      ti_kernel->ctx_buffer_host());
-  if (ctx_blitter) {
-    TI_ASSERT(ti_kernel->ctx_buffer() != nullptr);
-    ctx_blitter->host_to_device();
+    global_tmps_buffer_ = device_->allocate_memory_unique(
+        {gtmp_buffer_size,
+         /*host_write=*/false, /*host_read=*/false,
+         /*export_sharing=*/false, AllocUsage::Storage});
+
+    // Need to zero fill the buffers, otherwise there could be NaN.
+    Stream *stream = device_->get_compute_stream();
+    auto cmdlist = stream->new_command_list();
+
+    cmdlist->buffer_fill(global_tmps_buffer_->get_ptr(0), gtmp_buffer_size,
+                         /*data=*/0);
+    stream->submit_synced(cmdlist.get());
   }
 
-  if (!current_cmdlist_) {
-    current_cmdlist_ = device_->get_compute_stream()->new_command_list();
+  void add_root_buffer(size_t root_buffer_size) {
+    if (root_buffer_size == 0) {
+      root_buffer_size = 4;  // there might be empty roots
+    }
+    std::unique_ptr<DeviceAllocationGuard> new_buffer =
+        device_->allocate_memory_unique(
+            {root_buffer_size,
+             /*host_write=*/false, /*host_read=*/false,
+             /*export_sharing=*/false, AllocUsage::Storage});
+    Stream *stream = device_->get_compute_stream();
+    auto cmdlist = stream->new_command_list();
+    cmdlist->buffer_fill(new_buffer->get_ptr(0), root_buffer_size, /*data=*/0);
+    stream->submit_synced(cmdlist.get());
+    root_buffers_.push_back(std::move(new_buffer));
   }
 
-  ti_kernel->command_list(current_cmdlist_.get());
+  uint64_t *const host_result_buffer_;
 
-  if (ctx_blitter) {
-    device_->get_compute_stream()->submit(current_cmdlist_.get());
-    ctx_blitter->device_to_host();
+  std::unique_ptr<EmbeddedVulkanDevice> embedded_device_{nullptr};
 
-    current_cmdlist_ = nullptr;
+  std::vector<std::unique_ptr<DeviceAllocationGuard>> root_buffers_;
+  std::unique_ptr<DeviceAllocationGuard> global_tmps_buffer_;
+
+  Device *device_;
+
+  std::unique_ptr<CommandList> current_cmdlist_{nullptr};
+
+  std::vector<std::unique_ptr<CompiledTaichiKernel>> ti_kernels_;
+
+  std::vector<CompiledSNodeStructs> compiled_snode_structs_;
+};
+
+#else
+
+class VkRuntime::Impl {
+ public:
+  Impl(const Params &) {
+    TI_ERROR("Vulkan disabled");
   }
-}
 
-void VkRuntime::synchronize() {
-  if (current_cmdlist_) {
-    device_->get_compute_stream()->submit(current_cmdlist_.get());
-    current_cmdlist_ = nullptr;
+  KernelHandle register_taichi_kernel(RegisterParams) {
+    TI_ERROR("Vulkan disabled");
+    return KernelHandle();
+  }
+
+  void launch_kernel(KernelHandle, Context *) {
+    TI_ERROR("Vulkan disabled");
   }
-  device_->get_compute_stream()->command_sync();
+
+  void synchronize() {
+    TI_ERROR("Vulkan disabled");
+  }
+
+  void materialize_snode_tree(SNodeTree *tree) {
+    TI_ERROR("Vulkan disabled");
+  }
+
+  const std::vector<CompiledSNodeStructs> &get_compiled_structs() const {
+    TI_ERROR("Vulkan disabled");
+  }
+
+  void destroy_snode_tree(SNodeTree *snode_tree) {
+    TI_ERROR("Vulkan disabled");
+  }
+};
+
+#endif  // TI_WITH_VULKAN
+
+VkRuntime::VkRuntime(const Params &params)
+    : impl_(std::make_unique<Impl>(params)) {
 }
 
-Device *VkRuntime::get_ti_device() const {
-  return device_;
+VkRuntime::~VkRuntime() {
+}
+
+VkRuntime::KernelHandle VkRuntime::register_taichi_kernel(
+    RegisterParams params) {
+  return impl_->register_taichi_kernel(std::move(params));
+}
+
+void VkRuntime::launch_kernel(KernelHandle handle, Context *host_ctx) {
+  impl_->launch_kernel(handle, host_ctx);
 }
 
-void VkRuntime::init_buffers() {
-  size_t gtmp_buffer_size = 1024 * 1024;
+void VkRuntime::synchronize() {
+  impl_->synchronize();
+}
 
-  global_tmps_buffer_ = device_->allocate_memory_unique(
-      {gtmp_buffer_size,
-       /*host_write=*/false, /*host_read=*/false,
-       /*export_sharing=*/false, AllocUsage::Storage});
+void VkRuntime::materialize_snode_tree(SNodeTree *tree) {
+  impl_->materialize_snode_tree(tree);
+}
 
-  // Need to zero fill the buffers, otherwise there could be NaN.
-  Stream *stream = device_->get_compute_stream();
-  auto cmdlist = stream->new_command_list();
+const std::vector<CompiledSNodeStructs> &VkRuntime::get_compiled_structs()
+    const {
+  return impl_->get_compiled_structs();
+}
 
-  cmdlist->buffer_fill(global_tmps_buffer_->get_ptr(0), gtmp_buffer_size,
-                       /*data=*/0);
-  stream->submit_synced(cmdlist.get());
+void VkRuntime::destroy_snode_tree(SNodeTree *snode_tree) {
+  return impl_->destroy_snode_tree(snode_tree);
 }
 
-void VkRuntime::add_root_buffer(size_t root_buffer_size) {
-  if (root_buffer_size == 0) {
-    root_buffer_size = 4;  // there might be empty roots
-  }
-  std::unique_ptr<DeviceAllocationGuard> new_buffer =
-      device_->allocate_memory_unique(
-          {root_buffer_size,
-           /*host_write=*/false, /*host_read=*/false,
-           /*export_sharing=*/false, AllocUsage::Storage});
-  Stream *stream = device_->get_compute_stream();
-  auto cmdlist = stream->new_command_list();
-  cmdlist->buffer_fill(new_buffer->get_ptr(0), root_buffer_size, /*data=*/0);
-  stream->submit_synced(cmdlist.get());
-  root_buffers_.push_back(std::move(new_buffer));
+Device *VkRuntime::get_ti_device() const {
+#ifdef TI_WITH_VULKAN
+  return impl_->get_ti_device();
+#else
+  return nullptr;
+#endif
+}
+
+bool is_vulkan_api_available() {
+#ifdef TI_WITH_VULKAN
+  return VulkanLoader::instance().init();
+#else
+  return false;
+#endif
 }
 
 }  // namespace vulkan
diff --git a/taichi/backends/vulkan/runtime.h b/taichi/backends/vulkan/runtime.h
index d1389277a..76ce8144a 100644
--- a/taichi/backends/vulkan/runtime.h
+++ b/taichi/backends/vulkan/runtime.h
@@ -14,58 +14,13 @@ namespace taichi {
 namespace lang {
 namespace vulkan {
 
-using BufferType = TaskAttributes::BufferType;
-using BufferInfo = TaskAttributes::BufferInfo;
-using BufferBind = TaskAttributes::BufferBind;
-using BufferInfoHasher = TaskAttributes::BufferInfoHasher;
-
-// TODO: In the future this isn't necessarily a pointer, since DeviceAllocation
-// is already a pretty cheap handle>
-using InputBuffersMap =
-    std::unordered_map<BufferInfo, DeviceAllocation *, BufferInfoHasher>;
-
-class CompiledTaichiKernel {
- public:
-  struct Params {
-    const TaichiKernelAttributes *ti_kernel_attribs{nullptr};
-    std::vector<std::vector<uint32_t>> spirv_bins;
-    std::vector<CompiledSNodeStructs> compiled_structs;
-
-    Device *device{nullptr};
-    std::vector<DeviceAllocation *> root_buffers;
-    DeviceAllocation *global_tmps_buffer{nullptr};
-  };
-
-  CompiledTaichiKernel(const Params &ti_params);
-
-  const TaichiKernelAttributes &ti_kernel_attribs() const;
-
-  size_t num_pipelines() const;
-
-  DeviceAllocation *ctx_buffer() const;
-
-  DeviceAllocation *ctx_buffer_host() const;
-
-  void command_list(CommandList *cmdlist) const;
-
+class VkRuntime {
  private:
-  TaichiKernelAttributes ti_kernel_attribs_;
-  std::vector<TaskAttributes> tasks_attribs_;
-
-  Device *device_;
+  class Impl;
 
-  InputBuffersMap input_buffers_;
-
-  std::unique_ptr<DeviceAllocationGuard> ctx_buffer_{nullptr};
-  std::unique_ptr<DeviceAllocationGuard> ctx_buffer_host_{nullptr};
-  std::vector<std::unique_ptr<Pipeline>> pipelines_;
-};
-
-class VkRuntime {
  public:
   struct Params {
-    uint64_t *host_result_buffer{nullptr};
-    Device *device{nullptr};
+    uint64_t *host_result_buffer = nullptr;
   };
 
   explicit VkRuntime(const Params &params);
@@ -74,7 +29,7 @@ class VkRuntime {
 
   class KernelHandle {
    private:
-    friend class VkRuntime;
+    friend class Impl;
     int id_ = -1;
   };
 
@@ -98,23 +53,11 @@ class VkRuntime {
   const std::vector<CompiledSNodeStructs> &get_compiled_structs() const;
 
  private:
-  void init_buffers();
-  void add_root_buffer(size_t root_buffer_size);
-
-  Device *device_;
-
-  uint64_t *const host_result_buffer_;
-
-  std::vector<std::unique_ptr<DeviceAllocationGuard>> root_buffers_;
-  std::unique_ptr<DeviceAllocationGuard> global_tmps_buffer_;
-
-  std::unique_ptr<CommandList> current_cmdlist_{nullptr};
-
-  std::vector<std::unique_ptr<CompiledTaichiKernel>> ti_kernels_;
-
-  std::vector<CompiledSNodeStructs> compiled_snode_structs_;
+  std::unique_ptr<Impl> impl_;
 };
 
+bool is_vulkan_api_available();
+
 }  // namespace vulkan
 }  // namespace lang
 }  // namespace taichi
diff --git a/taichi/backends/vulkan/spirv_ir_builder.cpp b/taichi/backends/vulkan/spirv_ir_builder.cpp
index 4be026a40..37ac969d8 100644
--- a/taichi/backends/vulkan/spirv_ir_builder.cpp
+++ b/taichi/backends/vulkan/spirv_ir_builder.cpp
@@ -12,9 +12,9 @@ void IRBuilder::init_header() {
   TI_ASSERT(header_.size() == 0U);
   header_.push_back(spv::MagicNumber);
 
-  header_.push_back(device_->get_cap(cap::spirv_version));
+  header_.push_back(device_->get_cap(cap::vk_spirv_version));
 
-  TI_TRACE("SPIR-V Version {}", device_->get_cap(cap::spirv_version));
+  TI_TRACE("SPIR-V Version {}", device_->get_cap(cap::vk_spirv_version));
 
   // generator: set to 0, unknown
   header_.push_back(0U);
@@ -26,48 +26,46 @@ void IRBuilder::init_header() {
   // capability
   ib_.begin(spv::OpCapability).add(spv::CapabilityShader).commit(&header_);
 
-  if (device_->get_cap(cap::spirv_has_atomic_float64_add)) {
+  if (device_->get_cap(cap::vk_has_atomic_float64_add)) {
     ib_.begin(spv::OpCapability)
         .add(spv::CapabilityAtomicFloat64AddEXT)
         .commit(&header_);
   }
 
-  if (device_->get_cap(cap::spirv_has_atomic_float_add)) {
+  if (device_->get_cap(cap::vk_has_atomic_float_add)) {
     ib_.begin(spv::OpCapability)
         .add(spv::CapabilityAtomicFloat32AddEXT)
         .commit(&header_);
   }
 
-  if (device_->get_cap(cap::spirv_has_atomic_float_minmax)) {
+  if (device_->get_cap(cap::vk_has_atomic_float_minmax)) {
     ib_.begin(spv::OpCapability)
         .add(spv::CapabilityAtomicFloat32MinMaxEXT)
         .commit(&header_);
   }
 
-  if (device_->get_cap(cap::spirv_has_variable_ptr)) {
-    /*
+  if (device_->get_cap(cap::vk_has_spv_variable_ptr)) {
     ib_.begin(spv::OpCapability)
         .add(spv::CapabilityVariablePointers)
         .commit(&header_);
     ib_.begin(spv::OpCapability)
         .add(spv::CapabilityVariablePointersStorageBuffer)
         .commit(&header_);
-        */
   }
 
-  if (device_->get_cap(cap::spirv_has_int8)) {
+  if (device_->get_cap(cap::vk_has_int8)) {
     ib_.begin(spv::OpCapability).add(spv::CapabilityInt8).commit(&header_);
   }
-  if (device_->get_cap(cap::spirv_has_int16)) {
+  if (device_->get_cap(cap::vk_has_int16)) {
     ib_.begin(spv::OpCapability).add(spv::CapabilityInt16).commit(&header_);
   }
-  if (device_->get_cap(cap::spirv_has_int64)) {
+  if (device_->get_cap(cap::vk_has_int64)) {
     ib_.begin(spv::OpCapability).add(spv::CapabilityInt64).commit(&header_);
   }
-  if (device_->get_cap(cap::spirv_has_float16)) {
+  if (device_->get_cap(cap::vk_has_float16)) {
     ib_.begin(spv::OpCapability).add(spv::CapabilityFloat16).commit(&header_);
   }
-  if (device_->get_cap(cap::spirv_has_float64)) {
+  if (device_->get_cap(cap::vk_has_float64)) {
     ib_.begin(spv::OpCapability).add(spv::CapabilityFloat64).commit(&header_);
   }
 
@@ -75,19 +73,19 @@ void IRBuilder::init_header() {
       .add("SPV_KHR_storage_buffer_storage_class")
       .commit(&header_);
 
-  if (device_->get_cap(cap::spirv_has_variable_ptr)) {
+  if (device_->get_cap(cap::vk_has_spv_variable_ptr)) {
     ib_.begin(spv::OpExtension)
         .add("SPV_KHR_variable_pointers")
         .commit(&header_);
   }
 
-  if (device_->get_cap(cap::spirv_has_atomic_float_add)) {
+  if (device_->get_cap(cap::vk_has_atomic_float_add)) {
     ib_.begin(spv::OpExtension)
         .add("SPV_EXT_shader_atomic_float_add")
         .commit(&header_);
   }
 
-  if (device_->get_cap(cap::spirv_has_atomic_float_minmax)) {
+  if (device_->get_cap(cap::vk_has_atomic_float_minmax)) {
     ib_.begin(spv::OpExtension)
         .add("SPV_EXT_shader_atomic_float_min_max")
         .commit(&header_);
@@ -120,22 +118,22 @@ std::vector<uint32_t> IRBuilder::finalize() {
 void IRBuilder::init_pre_defs() {
   ext_glsl450_ = ext_inst_import("GLSL.std.450");
   t_bool_ = declare_primitive_type(get_data_type<bool>());
-  if (device_->get_cap(cap::spirv_has_int8)) {
+  if (device_->get_cap(cap::vk_has_int8)) {
     t_int8_ = declare_primitive_type(get_data_type<int8>());
     t_uint8_ = declare_primitive_type(get_data_type<uint8>());
   }
-  if (device_->get_cap(cap::spirv_has_int16)) {
+  if (device_->get_cap(cap::vk_has_int16)) {
     t_int16_ = declare_primitive_type(get_data_type<int16>());
     t_uint16_ = declare_primitive_type(get_data_type<uint16>());
   }
   t_int32_ = declare_primitive_type(get_data_type<int32>());
   t_uint32_ = declare_primitive_type(get_data_type<uint32>());
-  if (device_->get_cap(cap::spirv_has_int64)) {
+  if (device_->get_cap(cap::vk_has_int64)) {
     t_int64_ = declare_primitive_type(get_data_type<int64>());
     t_uint64_ = declare_primitive_type(get_data_type<uint64>());
   }
   t_fp32_ = declare_primitive_type(get_data_type<float32>());
-  if (device_->get_cap(cap::spirv_has_float64)) {
+  if (device_->get_cap(cap::vk_has_float64)) {
     t_fp64_ = declare_primitive_type(get_data_type<float64>());
   }
   // declare void, and void functions
@@ -212,35 +210,35 @@ SType IRBuilder::get_primitive_type(const DataType &dt) const {
   } else if (dt->is_primitive(PrimitiveTypeID::f32)) {
     return t_fp32_;
   } else if (dt->is_primitive(PrimitiveTypeID::f64)) {
-    if (!device_->get_cap(cap::spirv_has_float64))
+    if (!device_->get_cap(cap::vk_has_float64))
       TI_ERROR("Type {} not supported.", dt->to_string());
     return t_fp64_;
   } else if (dt->is_primitive(PrimitiveTypeID::i8)) {
-    if (!device_->get_cap(cap::spirv_has_int8))
+    if (!device_->get_cap(cap::vk_has_int8))
       TI_ERROR("Type {} not supported.", dt->to_string());
     return t_int8_;
   } else if (dt->is_primitive(PrimitiveTypeID::i16)) {
-    if (!device_->get_cap(cap::spirv_has_int16))
+    if (!device_->get_cap(cap::vk_has_int16))
       TI_ERROR("Type {} not supported.", dt->to_string());
     return t_int16_;
   } else if (dt->is_primitive(PrimitiveTypeID::i32)) {
     return t_int32_;
   } else if (dt->is_primitive(PrimitiveTypeID::i64)) {
-    if (!device_->get_cap(cap::spirv_has_int64))
+    if (!device_->get_cap(cap::vk_has_int64))
       TI_ERROR("Type {} not supported.", dt->to_string());
     return t_int64_;
   } else if (dt->is_primitive(PrimitiveTypeID::u8)) {
-    if (!device_->get_cap(cap::spirv_has_int8))
+    if (!device_->get_cap(cap::vk_has_int8))
       TI_ERROR("Type {} not supported.", dt->to_string());
     return t_uint8_;
   } else if (dt->is_primitive(PrimitiveTypeID::u16)) {
-    if (!device_->get_cap(cap::spirv_has_int16))
+    if (!device_->get_cap(cap::vk_has_int16))
       TI_ERROR("Type {} not supported.", dt->to_string());
     return t_uint16_;
   } else if (dt->is_primitive(PrimitiveTypeID::u32)) {
     return t_uint32_;
   } else if (dt->is_primitive(PrimitiveTypeID::u64)) {
-    if (!device_->get_cap(cap::spirv_has_int64))
+    if (!device_->get_cap(cap::vk_has_int64))
       TI_ERROR("Type {} not supported.", dt->to_string());
     return t_uint64_;
   } else {
@@ -252,13 +250,13 @@ SType IRBuilder::get_primitive_buffer_type(const bool struct_compiled,
                                            const DataType &dt) const {
   if (struct_compiled) {
     if (dt->is_primitive(PrimitiveTypeID::f32) &&
-        device_->get_cap(cap::spirv_has_atomic_float_add)) {
+        device_->get_cap(cap::vk_has_atomic_float_add)) {
       return t_fp32_;
     } else if (dt->is_primitive(PrimitiveTypeID::f64) &&
-               device_->get_cap(cap::spirv_has_atomic_float64_add)) {
+               device_->get_cap(cap::vk_has_atomic_float64_add)) {
       return t_fp64_;
     } else if (dt->is_primitive(PrimitiveTypeID::i64) &&
-               device_->get_cap(cap::spirv_has_atomic_i64)) {
+               device_->get_cap(cap::vk_has_atomic_i64)) {
       return t_int64_;
     }
   }
@@ -334,7 +332,7 @@ SType IRBuilder::get_struct_array_type(const SType &value_type,
       .add_seq(struct_type, 0, spv::DecorationOffset, 0)
       .commit(&decorate_);
 
-  if (device_->get_cap(cap::spirv_version) < 0x10300) {
+  if (device_->get_cap(cap::vk_spirv_version) < 0x10300) {
     // NOTE: BufferBlock was deprecated in SPIRV 1.3
     // use StorageClassStorageBuffer instead.
     // runtime array are always decorated as BufferBlock(shader storage buffer)
@@ -354,7 +352,7 @@ Value IRBuilder::buffer_argument(const SType &value_type,
   // NOTE: BufferBlock was deprecated in SPIRV 1.3
   // use StorageClassStorageBuffer instead.
   spv::StorageClass storage_class;
-  if (device_->get_cap(cap::spirv_version) < 0x10300) {
+  if (device_->get_cap(cap::vk_spirv_version) < 0x10300) {
     storage_class = spv::StorageClassUniform;
   } else {
     storage_class = spv::StorageClassStorageBuffer;
@@ -380,7 +378,7 @@ Value IRBuilder::struct_array_access(const SType &res_type,
   TI_ASSERT(res_type.flag == TypeKind::kPrimitive);
 
   spv::StorageClass storage_class;
-  if (device_->get_cap(cap::spirv_version) < 0x10300) {
+  if (device_->get_cap(cap::vk_spirv_version) < 0x10300) {
     storage_class = spv::StorageClassUniform;
   } else {
     storage_class = spv::StorageClassStorageBuffer;
@@ -910,10 +908,6 @@ void IRBuilder::init_random_function(Value global_tmp_) {
   _rand_y_ = new_value(local_type, ValueKind::kVariablePtr);
   _rand_z_ = new_value(local_type, ValueKind::kVariablePtr);
   _rand_w_ = new_value(local_type, ValueKind::kVariablePtr);
-  global_values.push_back(_rand_x_);
-  global_values.push_back(_rand_y_);
-  global_values.push_back(_rand_z_);
-  global_values.push_back(_rand_w_);
   ib_.begin(spv::OpVariable)
       .add_seq(local_type, _rand_x_, spv::StorageClassPrivate)
       .commit(&global_);
diff --git a/taichi/backends/vulkan/spirv_ir_builder.h b/taichi/backends/vulkan/spirv_ir_builder.h
index dcc116bff..a47a15d0b 100644
--- a/taichi/backends/vulkan/spirv_ir_builder.h
+++ b/taichi/backends/vulkan/spirv_ir_builder.h
@@ -332,8 +332,6 @@ class IRBuilder {
     return new_value(t_void_func_, ValueKind::kFunction);
   }
 
-  std::vector<Value> global_values;
-
   // Declare the entry point for a kernel function
   void commit_kernel_function(const Value &func,
                               const std::string &name,
@@ -344,11 +342,6 @@ class IRBuilder {
     for (const auto &arg : args) {
       ib_.add(arg);
     }
-    if (device_->get_cap(DeviceCapability::spirv_version) >= 0x10400) {
-      for (const auto &v : global_values) {
-        ib_.add(v);
-      }
-    }
     if (gl_global_invocation_id.id != 0) {
       ib_.add(gl_global_invocation_id);
     }
diff --git a/taichi/backends/vulkan/vulkan_program.cpp b/taichi/backends/vulkan/vulkan_program.cpp
index 80f749456..7c03bd371 100644
--- a/taichi/backends/vulkan/vulkan_program.cpp
+++ b/taichi/backends/vulkan/vulkan_program.cpp
@@ -16,13 +16,8 @@ void VulkanProgramImpl::materialize_runtime(MemoryPool *memory_pool,
   *result_buffer_ptr = (uint64 *)memory_pool->allocate(
       sizeof(uint64) * taichi_result_buffer_entries, 8);
 
-  EmbeddedVulkanDevice::Params evd_params;
-  evd_params.api_version = VulkanEnvSettings::kApiVersion();
-  embedded_device_ = std::make_unique<EmbeddedVulkanDevice>(evd_params);
-
   vulkan::VkRuntime::Params params;
   params.host_result_buffer = *result_buffer_ptr;
-  params.device = embedded_device_->device();
   vulkan_runtime_ = std::make_unique<vulkan::VkRuntime>(std::move(params));
 }
 
@@ -31,13 +26,9 @@ void VulkanProgramImpl::materialize_snode_tree(
     std::vector<std::unique_ptr<SNodeTree>> &,
     std::unordered_map<int, SNode *> &,
     uint64 *result_buffer) {
+  // TODO: support materializing multiple snode trees
   vulkan_runtime_->materialize_snode_tree(tree);
 }
 
-VulkanProgramImpl::~VulkanProgramImpl() {
-  vulkan_runtime_.reset();
-  embedded_device_.reset();
-}
-
 }  // namespace lang
 }  // namespace taichi
diff --git a/taichi/backends/vulkan/vulkan_program.h b/taichi/backends/vulkan/vulkan_program.h
index dd2fcf55e..ce72b87df 100644
--- a/taichi/backends/vulkan/vulkan_program.h
+++ b/taichi/backends/vulkan/vulkan_program.h
@@ -8,22 +8,10 @@
 #include "taichi/program/snode_expr_utils.h"
 #include "taichi/program/program_impl.h"
 
-#include "taichi/backends/vulkan/embedded_device.h"
-#include "taichi/backends/vulkan/vulkan_utils.h"
-#include "taichi/backends/vulkan/loader.h"
-
-#include "vk_mem_alloc.h"
-#include "taichi/backends/vulkan/vulkan_device.h"
-
 #include <optional>
 
 namespace taichi {
 namespace lang {
-
-namespace vulkan {
-class EmbeddedVulkanDevice;
-}
-
 class VulkanProgramImpl : public ProgramImpl {
  public:
   VulkanProgramImpl(CompileConfig &config) : ProgramImpl(config) {
@@ -58,10 +46,10 @@ class VulkanProgramImpl : public ProgramImpl {
     vulkan_runtime_->destroy_snode_tree(snode_tree);
   }
 
-  ~VulkanProgramImpl() override;
+  ~VulkanProgramImpl() {
+  }
 
  private:
-  std::unique_ptr<vulkan::EmbeddedVulkanDevice> embedded_device_{nullptr};
   std::unique_ptr<vulkan::VkRuntime> vulkan_runtime_;
 };
 }  // namespace lang
diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp
index a2352e210..cd67d310c 100644
--- a/taichi/program/program.cpp
+++ b/taichi/program/program.cpp
@@ -28,7 +28,7 @@
 #endif
 #ifdef TI_WITH_VULKAN
 #include "taichi/backends/vulkan/vulkan_program.h"
-#include "taichi/backends/vulkan/loader.h"
+#include "taichi/backends/vulkan/runtime.h"
 #endif
 
 #if defined(TI_ARCH_x64)
diff --git a/taichi/python/export_misc.cpp b/taichi/python/export_misc.cpp
index b73efbba6..bcda0684e 100644
--- a/taichi/python/export_misc.cpp
+++ b/taichi/python/export_misc.cpp
@@ -23,10 +23,6 @@
 #include "taichi/backends/cuda/cuda_driver.h"
 #endif
 
-#ifdef TI_WITH_VULKAN
-#include "taichi/backends/vulkan/loader.h"
-#endif
-
 #ifdef TI_WITH_CC
 namespace taichi::lang::cccp {
 extern bool is_c_backend_available();
@@ -170,11 +166,7 @@ void export_misc(py::module &m) {
   m.def("with_cuda", is_cuda_api_available);
   m.def("with_metal", taichi::lang::metal::is_metal_api_available);
   m.def("with_opengl", taichi::lang::opengl::is_opengl_api_available);
-#ifdef TI_WITH_VULKAN
   m.def("with_vulkan", taichi::lang::vulkan::is_vulkan_api_available);
-#else
-  m.def("with_vulkan", []() { return false; });
-#endif
 
 #ifdef TI_WITH_CC
   m.def("with_cc", taichi::lang::cccp::is_c_backend_available);