diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index ccd420f17..ce564e2b0 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -3,12 +3,12 @@ Related issue = # +

Documentation | 简体中文文档 | Contributor Guidelines

[![AppVeyor Status](https://img.shields.io/appveyor/build/yuanming-hu/taichi?logo=AppVeyor&label=AppVeyor)](https://ci.appveyor.com/project/yuanming-hu/taichi/branch/master) @@ -13,9 +13,9 @@ **Taichi** (太极) is a programming language designed for *high-performance computer graphics*. It is deeply embedded in **Python**, and its **just-in-time compiler** offloads compute-intensive tasks to multi-core CPUs and massively parallel GPUs. - + -Advanced features of Taichi include [spatially sparse computing](https://taichi.readthedocs.io/en/latest/sparse.html) and [differentiable programming](https://taichi.readthedocs.io/en/latest/differentiable_programming.html) [[examples]](https://github.com/yuanming-hu/difftaichi). +Advanced features of Taichi include [spatially sparse computing](https://docs.taichi.graphics/docs/lang/articles/advanced/sparse) and [differentiable programming](https://docs.taichi.graphics/docs/lang/articles/advanced/differentiable_programming) [[examples]](https://github.com/yuanming-hu/difftaichi). **Please check out our SIGGRAPH 2020 course on Taichi basics:** [YouTube](https://youtu.be/Y0-76n3aZFA), [Bilibili](https://www.bilibili.com/video/BV1kA411n7jk/), [slides (pdf)](https://yuanming.taichi.graphics/publication/2020-taichi-tutorial/taichi-tutorial.pdf). @@ -23,10 +23,10 @@ Advanced features of Taichi include [spatially sparse computing](https://taichi. ## Examples ([More...](misc/examples.md)) - - - - + + + + ## Installation [![Downloads](https://pepy.tech/badge/taichi)](https://pepy.tech/project/taichi) @@ -67,3 +67,7 @@ If you use Taichi in your research, please cite our papers: - [Taichi Conference](https://github.com/taichi-dev/taichicon): Taichi developer conferences. - [GAMES 201 Lectures](https://github.com/taichi-dev/games201): (Chinese) A hands-on course on building advanced physics engines, based on Taichi. - [More...](misc/links.md) + +## Security + +Please disclose security issues responsibly by contacting contact@taichi.graphics. 
diff --git a/cmake/PythonNumpyPybind11.cmake b/cmake/PythonNumpyPybind11.cmake index bd351900a..5957afc79 100644 --- a/cmake/PythonNumpyPybind11.cmake +++ b/cmake/PythonNumpyPybind11.cmake @@ -14,7 +14,13 @@ endif () if (WIN32) execute_process(COMMAND where ${PYTHON_EXECUTABLE} - OUTPUT_VARIABLE PYTHON_EXECUTABLE_PATH) + OUTPUT_VARIABLE PYTHON_EXECUTABLE_PATHS) + if (${PYTHON_EXECUTABLE_PATHS}) + string(FIND ${PYTHON_EXECUTABLE_PATHS} "\n" _LINE_BREAK_LOC) + string(SUBSTRING ${PYTHON_EXECUTABLE_PATHS} 0 ${_LINE_BREAK_LOC} PYTHON_EXECUTABLE_PATH) + else () + set(PYTHON_EXECUTABLE_PATH ${PYTHON_EXECUTABLE}) + endif () else () execute_process(COMMAND which ${PYTHON_EXECUTABLE} OUTPUT_VARIABLE PYTHON_EXECUTABLE_PATH) diff --git a/cmake/TaichiCXXFlags.cmake b/cmake/TaichiCXXFlags.cmake index e6f629163..030d58c6a 100644 --- a/cmake/TaichiCXXFlags.cmake +++ b/cmake/TaichiCXXFlags.cmake @@ -44,11 +44,14 @@ if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64" OR "${CMAKE_SYSTEM_PROCESSOR}" message("Setting -march=nehalem for x86_64 processors") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=nehalem -DTI_ARCH_x64") endif() + set(ARCH "x64") elseif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64" OR "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "arm64") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTI_ARCH_ARM") + set(ARCH "arm64") else() message(FATAL_ERROR "Unknown processor type ${CMAKE_SYSTEM_PROCESSOR}") endif() +set(HOST_ARCH ${ARCH} CACHE INTERNAL "Host arch") if (USE_STDCPP) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++") diff --git a/cmake/TaichiTests.cmake b/cmake/TaichiTests.cmake index 4404a1cda..d8f594474 100644 --- a/cmake/TaichiTests.cmake +++ b/cmake/TaichiTests.cmake @@ -23,6 +23,15 @@ include_directories( ) add_executable(${TESTS_NAME} ${TAICHI_TESTS_SOURCE}) +if (WIN32) + # Output the executable to bin/ instead of build/Debug/... 
+ set(TESTS_OUTPUT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/bin") + set_target_properties(${TESTS_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${TESTS_OUTPUT_DIR}) + set_target_properties(${TESTS_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG ${TESTS_OUTPUT_DIR}) + set_target_properties(${TESTS_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE ${TESTS_OUTPUT_DIR}) + set_target_properties(${TESTS_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_MINSIZEREL ${TESTS_OUTPUT_DIR}) + set_target_properties(${TESTS_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELWITHDEBINFO ${TESTS_OUTPUT_DIR}) +endif() target_link_libraries(${TESTS_NAME} taichi_isolated_core) target_link_libraries(${TESTS_NAME} gtest_main) diff --git a/docs/lang/api/reference/_category_.json b/docs/lang/api/reference/_category_.json new file mode 100644 index 000000000..ec4123c51 --- /dev/null +++ b/docs/lang/api/reference/_category_.json @@ -0,0 +1,4 @@ +{ + "label": "API Reference", + "position": 99 +} diff --git a/docs/lang/articles/advanced/_category_.json b/docs/lang/articles/advanced/_category_.json new file mode 100644 index 000000000..84fc16f93 --- /dev/null +++ b/docs/lang/articles/advanced/_category_.json @@ -0,0 +1,4 @@ +{ + "label": "Advanced Programming", + "position": 3 +} diff --git a/docs/lang/articles/basic/_category_.json b/docs/lang/articles/basic/_category_.json new file mode 100644 index 000000000..60b7ef558 --- /dev/null +++ b/docs/lang/articles/basic/_category_.json @@ -0,0 +1,4 @@ +{ + "label": "Taichi Language Basic Concepts", + "position": 2 +} diff --git a/docs/lang/articles/contribution/_category_.json b/docs/lang/articles/contribution/_category_.json new file mode 100644 index 000000000..74136a441 --- /dev/null +++ b/docs/lang/articles/contribution/_category_.json @@ -0,0 +1,4 @@ +{ + "label": "Contribution Guide", + "position": 5 +} diff --git a/docs/lang/articles/contribution/contributor_guide.md b/docs/lang/articles/contribution/contributor_guide.md index 7c1adb64c..b1d8d62ff 100644 --- 
a/docs/lang/articles/contribution/contributor_guide.md +++ b/docs/lang/articles/contribution/contributor_guide.md @@ -166,7 +166,7 @@ This design is terrible. `yapf v0.29.0` locally before you use `ti format`. - If you don't have these formatting tools locally, feel free to - leverage GitHub actions: simply comment `\format` in a PR + leverage GitHub actions: simply comment `/format` in a PR (e.g., [#2481](https://github.com/taichi-dev/taichi/pull/2481#issuecomment-872226701)) and then [Taichi Gardener](https://github.com/taichi-gardener) will automatically format the code for you. diff --git a/docs/lang/articles/contribution/life_of_kernel_lowres.jpg b/docs/lang/articles/contribution/life_of_kernel_lowres.jpg new file mode 100644 index 000000000..0f7dbc332 Binary files /dev/null and b/docs/lang/articles/contribution/life_of_kernel_lowres.jpg differ diff --git a/docs/lang/articles/contribution/writing_cpp_tests.md b/docs/lang/articles/contribution/writing_cpp_tests.md index d41a42037..14792216d 100644 --- a/docs/lang/articles/contribution/writing_cpp_tests.md +++ b/docs/lang/articles/contribution/writing_cpp_tests.md @@ -21,6 +21,11 @@ make ./taichi_cpp_tests ``` +:::note +On Windows, `taichi_cpp_tests.exe` will be placed inside the `%TAICHI_REPO_DIR%\bin` directory, +so you can directly run it after adding the directory to `PATH` in [Setting up Taichi for development](dev_install.md#setting-up-taichi-for-development-3). 
+::: + :::note Consider polishing the CPP test infrastructure: diff --git a/docs/lang/articles/misc/_category_.json b/docs/lang/articles/misc/_category_.json new file mode 100644 index 000000000..da3cf0599 --- /dev/null +++ b/docs/lang/articles/misc/_category_.json @@ -0,0 +1,4 @@ +{ + "label": "Miscellaneous Topics", + "position": 4 +} diff --git a/examples/rendering/cornell_box.py b/examples/rendering/cornell_box.py index 94e916733..9560b09ee 100644 --- a/examples/rendering/cornell_box.py +++ b/examples/rendering/cornell_box.py @@ -1,6 +1,7 @@ import time import numpy as np +from numpy.lib.function_base import average import taichi as ti @@ -8,6 +9,7 @@ res = (800, 800) color_buffer = ti.Vector.field(3, dtype=ti.f32, shape=res) count_var = ti.field(ti.i32, shape=(1, )) +tonemapped_buffer = ti.Vector.field(3, dtype=ti.f32, shape=res) max_ray_depth = 10 eps = 1e-4 @@ -482,18 +484,34 @@ def render(): count_var[0] = (count_var[0] + 1) % (stratify_res * stratify_res) -gui = ti.GUI('Cornell Box', res) +@ti.kernel +def tonemap(accumulated: ti.f32) -> ti.f32: + sum = 0.0 + sum_sq = 0.0 + for i, j in color_buffer: + luma = color_buffer[i, j][0] * 0.2126 + color_buffer[ + i, j][1] * 0.7152 + color_buffer[i, j][2] * 0.0722 + sum += luma + sum_sq += ti.pow(luma / accumulated, 2.0) + mean = sum / (res[0] * res[1]) + var = sum_sq / (res[0] * res[1]) - ti.pow(mean / accumulated, 2.0) + for i, j in tonemapped_buffer: + tonemapped_buffer[i, j] = ti.sqrt(color_buffer[i, j] / mean * 0.6) + return var + + +gui = ti.GUI('Cornell Box', res, fast_gui=True) +gui.fps_limit = 300 last_t = time.time() i = 0 while gui.running: render() interval = 10 - if i % interval == 0 and i > 0: - img = color_buffer.to_numpy() * (1 / (i + 1)) - img = np.sqrt(img / img.mean() * 0.24) + if i % interval == 0: + var = tonemap(i) print("{:.2f} samples/s ({} iters, var={})".format( - interval / (time.time() - last_t), i, np.var(img))) + interval / (time.time() - last_t), i, var)) last_t = time.time() - 
gui.set_image(img) + gui.set_image(tonemapped_buffer) gui.show() i += 1 diff --git a/examples/simulation/mpm128.py b/examples/simulation/mpm128.py index a32bf420e..8dffdd7ff 100644 --- a/examples/simulation/mpm128.py +++ b/examples/simulation/mpm128.py @@ -148,7 +148,9 @@ def reset(): attractor_strength[None] = -1 for s in range(int(2e-3 // dt)): substep() - colors = np.array([0x068587, 0xED553B, 0xEEEEF0], dtype=np.uint32) - gui.circles(x.to_numpy(), radius=1.5, color=colors[material.to_numpy()]) + gui.circles(x.to_numpy(), + radius=1.5, + palette=[0x068587, 0xED553B, 0xEEEEF0], + palette_indices=material) gui.show( ) # Change to gui.show(f'{frame:06d}.png') to write images to disk diff --git a/examples/simulation/mpm99.py b/examples/simulation/mpm99.py index 9a9d8fe59..cf71c3877 100644 --- a/examples/simulation/mpm99.py +++ b/examples/simulation/mpm99.py @@ -122,7 +122,9 @@ def initialize(): while not gui.get_event(ti.GUI.ESCAPE, ti.GUI.EXIT): for s in range(int(2e-3 // dt)): substep() - colors = np.array([0x068587, 0xED553B, 0xEEEEF0], dtype=np.uint32) - gui.circles(x.to_numpy(), radius=1.5, color=colors[material.to_numpy()]) + gui.circles(x.to_numpy(), + radius=1.5, + palette=[0x068587, 0xED553B, 0xEEEEF0], + palette_indices=material) gui.show( ) # Change to gui.show(f'{frame:06d}.png') to write images to disk diff --git a/misc/ci_setup.py b/misc/ci_setup.py index 24868c76e..1e27fa94e 100644 --- a/misc/ci_setup.py +++ b/misc/ci_setup.py @@ -257,12 +257,11 @@ def run(self): execute_command('echo $PYTHONPATH') elif get_os_name() != 'win': # compile .. - os.makedirs('build', exist_ok=True) arg = environ.get('CI_SETUP_CMAKE_ARGS', '') + os.makedirs('build', exist_ok=True) execute_command( - f'cd build && cmake .. 
-DPYTHON_EXECUTABLE={sys.executable} {arg}' + f'TAICHI_CMAKE_ARGS="{arg}" {sys.executable} setup.py install --user' ) - execute_command('cd build && make -j 10') return if test_installation(): print(' Successfully Installed Taichi at {}.'.format( diff --git a/python/.gitignore b/python/.gitignore index 7d9f12eb9..12389e8d4 100644 --- a/python/.gitignore +++ b/python/.gitignore @@ -1,5 +1,7 @@ lib taichi.egg-info taichi/include -setup.py +taichi/examples +taichi/assets +taichi/tests release diff --git a/python/MANIFEST.in b/python/MANIFEST.in deleted file mode 100644 index c8c81c5e8..000000000 --- a/python/MANIFEST.in +++ /dev/null @@ -1,12 +0,0 @@ -include MANIFEST.in -include *.txt -include *.py -include *.cfg -include taichi/*.md -include taichi/assets/* -include taichi/lib/*.so -include taichi/lib/*.pyd -include taichi/lib/*.bc - -global-exclude *.pyc *.pyo -global-exclude ffmpeg diff --git a/python/build.py b/python/build.py index ffcd10a40..34a33a58c 100644 --- a/python/build.py +++ b/python/build.py @@ -5,8 +5,6 @@ import shutil import sys -import taichi as ti - def get_os_name(): name = platform.platform() @@ -26,95 +24,33 @@ def get_python_executable(): def build(project_name): - """Build and package the wheel file in `python/dist`""" + """Build and package the wheel file in root `dist` dir""" if platform.system() == 'Linux': if re.search("^clang\+\+-*\d*", str(os.environ.get('CXX'))) is None: raise RuntimeError( 'Only the wheel with clang will be released to PyPI') - version = ti.core.get_version_string() - with open('../setup.py') as fin: - with open('setup.py', 'w') as fout: - print("project_name = '{}'".format(project_name), file=fout) - print("version = '{}'".format(version), file=fout) - for l in fin: - print(l, file=fout, end='') - - print("*** project_name = '{}'".format(project_name)) - - try: - os.remove('taichi/CHANGELOG.md') - except FileNotFoundError: - pass - shutil.rmtree('taichi/lib', ignore_errors=True) - shutil.rmtree('taichi/tests', 
ignore_errors=True) - shutil.rmtree('taichi/examples', ignore_errors=True) - shutil.rmtree('taichi/assets', ignore_errors=True) - os.makedirs('taichi/lib', exist_ok=True) - shutil.rmtree('build', ignore_errors=True) - shutil.rmtree('dist', ignore_errors=True) - shutil.rmtree('taichi/include', ignore_errors=True) - # shutil.copytree('../include/', 'taichi/include') - build_dir = '../build' - - if get_os_name() == 'linux': - shutil.copy('../build/libtaichi_core.so', 'taichi/lib/taichi_core.so') - elif get_os_name() == 'osx': - shutil.copy('../build/libtaichi_core.dylib', - 'taichi/lib/taichi_core.so') - else: - shutil.copy('../runtimes/RelWithDebInfo/taichi_core.dll', - 'taichi/lib/taichi_core.pyd') - - os.system(f'cd .. && {get_python_executable()} -m taichi changelog --save') - - try: - with open('../CHANGELOG.md') as f: - print(f.read()) - except FileNotFoundError: - print('CHANGELOG.md not found') - pass - - try: - shutil.copy('../CHANGELOG.md', './taichi/CHANGELOG.md') - except FileNotFoundError: - pass - shutil.copytree('../tests/python', './taichi/tests') - shutil.copytree('../examples', './taichi/examples') - shutil.copytree('../external/assets', './taichi/assets') - - if get_os_name() != 'osx': - libdevice_path = ti.core.libdevice_path() - print("copying libdevice:", libdevice_path) - assert os.path.exists(libdevice_path) - shutil.copy(libdevice_path, 'taichi/lib/slim_libdevice.10.bc') - - ti.core.compile_runtimes() - runtime_dir = ti.core.get_runtime_dir() - for f in os.listdir(runtime_dir): - if f.startswith('runtime_') and f.endswith('.bc'): - print(f"Fetching runtime file {f}") - shutil.copy(os.path.join(runtime_dir, f), 'taichi/lib') print("Using python executable", get_python_executable()) os.system( '{} -m pip install --user --upgrade twine setuptools wheel'.format( get_python_executable())) + os.system(f'{get_python_executable()} -m taichi changelog --save') + if get_os_name() == 'linux': - os.system('{} setup.py bdist_wheel -p 
manylinux1_x86_64'.format( - get_python_executable())) + os.system( + f'cd ..; PROJECT_NAME={project_name} {get_python_executable()} setup.py bdist_wheel -p manylinux1_x86_64' + ) else: - os.system('{} setup.py bdist_wheel'.format(get_python_executable())) + os.system( + f'cd ..; PROJECT_NAME={project_name} {get_python_executable()} setup.py bdist_wheel' + ) - shutil.rmtree('taichi/lib') - shutil.rmtree('taichi/tests') - shutil.rmtree('taichi/examples') - shutil.rmtree('taichi/assets') try: os.remove('taichi/CHANGELOG.md') except FileNotFoundError: pass - shutil.rmtree('./build') + shutil.rmtree('../build') def parse_args(): @@ -149,6 +85,8 @@ def main(): env_pypi_pwd = os.environ.get('PYPI_PWD', '') + shutil.rmtree('../dist', ignore_errors=True) + if mode == 'try_upload': if env_pypi_pwd == '': print("Missing environment variable PYPI_PWD") @@ -174,16 +112,18 @@ def main(): get_python_executable(), pypi_repo, pypi_user)) elif mode == 'test': print('Uninstalling old taichi packages...') - os.system(f'{get_python_executable()} -m pip uninstall taichi-nightly') - os.system(f'{get_python_executable()} -m pip uninstall taichi') - dists = os.listdir('dist') + os.system( + f'{get_python_executable()} -m pip uninstall -y taichi-nightly') + os.system(f'{get_python_executable()} -m pip uninstall -y taichi') + dists = os.listdir('../dist') assert len(dists) == 1 dist = dists[0] print('Installing ', dist) os.environ['PYTHONPATH'] = '' os.makedirs('test_env', exist_ok=True) - os.system('cd test_env && {} -m pip install ../dist/{} --user'.format( - get_python_executable(), dist)) + os.system( + 'cd test_env && {} -m pip install ../../dist/{} --user'.format( + get_python_executable(), dist)) print('Entering test environment...') if get_os_name() == 'win': os.system( diff --git a/python/taichi/lang/__init__.py b/python/taichi/lang/__init__.py index 97678d9c0..dcfd0edec 100644 --- a/python/taichi/lang/__init__.py +++ b/python/taichi/lang/__init__.py @@ -404,7 +404,8 @@ def 
visit(node): from taichi.lang.meta import clear_gradients clear_gradients(places) - visit(ti.root) + for root_fb in FieldsBuilder.finalized_roots(): + visit(root_fb) def benchmark(func, repeat=300, args=()): diff --git a/python/taichi/lang/expr.py b/python/taichi/lang/expr.py index b999bc202..62386afe2 100644 --- a/python/taichi/lang/expr.py +++ b/python/taichi/lang/expr.py @@ -141,6 +141,10 @@ def snode(self): def __hash__(self): return self.ptr.get_raw_address() + @property + def name(self): + return self.snode.name + @property def shape(self): if self.ptr.is_external_var(): diff --git a/python/taichi/lang/impl.py b/python/taichi/lang/impl.py index e3d010be0..819b0e5aa 100644 --- a/python/taichi/lang/impl.py +++ b/python/taichi/lang/impl.py @@ -11,6 +11,7 @@ from taichi.lang.util import (cook_dtype, is_taichi_class, python_scope, taichi_scope) from taichi.misc.util import deprecated, get_traceback, warning +from taichi.snode.fields_builder import FieldsBuilder import taichi as ti @@ -127,6 +128,14 @@ def subscript(value, *indices): raise TypeError( 'Subscription (e.g., "a[i, j]") only works on fields or external arrays.' ) + if not value.ptr.is_external_var() and value.ptr.snode() is None: + if not value.ptr.is_primal(): + raise RuntimeError( + f"Gradient {value.ptr.get_expr_name()} has not been placed, check whether `needs_grad=True`" + ) + else: + raise RuntimeError( + f"{value.ptr.get_expr_name()} has not been placed.") field_dim = int(value.ptr.get_attribute("dim")) else: # When reading bit structure we only support the 0-D case for now. 
@@ -237,16 +246,20 @@ def create_program(self): if self.prog is None: self.prog = _ti_core.Program() + def materialize_root_fb(self, first): + if (not root.finalized and not root.empty) or first: + root.finalize() + + if root.finalized: + global _root_fb + _root_fb = FieldsBuilder() + def materialize(self): + self.materialize_root_fb(not self.materialized) + if self.materialized: return - print('[Taichi] materializing...') - self.create_program() - - if not root.finalized: - root.finalize() - self.materialized = True not_placed = [] for var in self.global_vars: @@ -424,7 +437,7 @@ def var(dt, shape=None, offset=None, needs_grad=False): @python_scope -def field(dtype, shape=None, offset=None, needs_grad=False): +def field(dtype, shape=None, name="", offset=None, needs_grad=False): _taichi_skip_traceback = 1 dtype = cook_dtype(dtype) @@ -449,6 +462,7 @@ def field(dtype, shape=None, offset=None, needs_grad=False): x = Expr(_ti_core.make_id_expr("")) x.declaration_tb = get_traceback(stacklevel=2) x.ptr = _ti_core.global_new(x.ptr, dtype) + x.ptr.set_name(name) x.ptr.set_is_primal(True) pytaichi.global_vars.append(x) @@ -456,6 +470,7 @@ def field(dtype, shape=None, offset=None, needs_grad=False): # adjoint x_grad = Expr(_ti_core.make_id_expr("")) x_grad.ptr = _ti_core.global_new(x_grad.ptr, dtype) + x_grad.ptr.set_name(name + ".grad") x_grad.ptr.set_is_primal(False) x.set_grad(x_grad) @@ -502,7 +517,13 @@ def vars2entries(vars): if hasattr(var, '__ti_repr__'): res = var.__ti_repr__() elif isinstance(var, (list, tuple)): - res = list_ti_repr(var) + res = var + # If the first element is '__ti_format__', this list is the result of ti_format. 
+ if len(var) > 0 and isinstance( + var[0], str) and var[0] == '__ti_format__': + res = var[1:] + else: + res = list_ti_repr(var) else: yield var continue @@ -537,6 +558,35 @@ def fused_string(entries): _ti_core.create_print(contentries) +@taichi_scope +def ti_format(*args): + content = args[0] + mixed = args[1:] + new_mixed = [] + args = [] + for x in mixed: + if isinstance(x, ti.Expr): + new_mixed.append('{}') + args.append(x) + else: + new_mixed.append(x) + + try: + content = content.format(*new_mixed) + except ValueError: + print('Number formatting is not supported with Taichi fields') + exit(1) + res = content.split('{}') + assert len(res) == len( + args + ) + 1, 'Number of args is different from number of positions provided in string' + + for i in range(len(args)): + res.insert(i * 2 + 1, args[i]) + res.insert(0, '__ti_format__') + return res + + @taichi_scope def ti_assert(cond, msg, extra_args): # Mostly a wrapper to help us convert from Expr (defined in Python) to diff --git a/python/taichi/lang/kernel_impl.py b/python/taichi/lang/kernel_impl.py index eb2a07c38..2f648e4ec 100644 --- a/python/taichi/lang/kernel_impl.py +++ b/python/taichi/lang/kernel_impl.py @@ -351,8 +351,7 @@ def materialize(self, key=None, args=None, arg_features=None): _taichi_skip_traceback = 1 if key is None: key = (self.func, 0) - if not self.runtime.materialized: - self.runtime.materialize() + self.runtime.materialize() if key in self.compiled_functions: return grad_suffix = "" diff --git a/python/taichi/lang/matrix.py b/python/taichi/lang/matrix.py index 46187c54f..42362f1b8 100644 --- a/python/taichi/lang/matrix.py +++ b/python/taichi/lang/matrix.py @@ -585,6 +585,10 @@ def shape(self): def dim(self): return len(self.shape) + @property + def name(self): + return self.loop_range().name + @property def dtype(self): return self.loop_range().dtype @@ -807,6 +811,7 @@ def field(cls, m, dtype, shape=None, + name="", offset=None, needs_grad=False, layout=None): # TODO(archibate): 
deprecate layout @@ -825,17 +830,17 @@ def field(cls, dtype ) == n, f'Please set correct dtype list for Vector. The shape of dtype list should be ({n}, ) instead of {np.shape(dtype)}' for i in range(n): - self.entries.append(impl.field(dtype[i])) + self.entries.append(impl.field(dtype[i], name=name)) else: assert len(np.shape(dtype)) == 2 and len(dtype) == n and len( dtype[0] ) == m, f'Please set correct dtype list for Matrix. The shape of dtype list should be ({n}, {m}) instead of {np.shape(dtype)}' for i in range(n): for j in range(m): - self.entries.append(impl.field(dtype[i][j])) + self.entries.append(impl.field(dtype[i][j], name=name)) else: for _ in range(n * m): - self.entries.append(impl.field(dtype)) + self.entries.append(impl.field(dtype, name=name)) self.grad = self.make_grad() if layout is not None: diff --git a/python/taichi/lang/meta.py b/python/taichi/lang/meta.py index 0a20bb365..73250fd41 100644 --- a/python/taichi/lang/meta.py +++ b/python/taichi/lang/meta.py @@ -1,26 +1,28 @@ from taichi.core import settings from taichi.lang import impl from taichi.lang.expr import Expr +from taichi.lang.kernel_arguments import ext_arr, template +from taichi.lang.kernel_impl import kernel import taichi as ti # A set of helper (meta)functions -@ti.kernel -def fill_tensor(tensor: ti.template(), val: ti.template()): +@kernel +def fill_tensor(tensor: template(), val: template()): for I in ti.grouped(tensor): tensor[I] = val -@ti.kernel -def tensor_to_ext_arr(tensor: ti.template(), arr: ti.ext_arr()): +@kernel +def tensor_to_ext_arr(tensor: template(), arr: ext_arr()): for I in ti.grouped(tensor): arr[I] = tensor[I] -@ti.kernel -def vector_to_fast_image(img: ti.template(), out: ti.ext_arr()): +@kernel +def vector_to_fast_image(img: template(), out: ext_arr()): # FIXME: Why is ``for i, j in img:`` slower than: for i, j in ti.ndrange(*img.shape): r, g, b = 0, 0, 0 @@ -45,8 +47,8 @@ def vector_to_fast_image(img: ti.template(), out: ti.ext_arr()): out[idx] = (b << 16) + 
(g << 8) + r + alpha -@ti.kernel -def tensor_to_image(tensor: ti.template(), arr: ti.ext_arr()): +@kernel +def tensor_to_image(tensor: template(), arr: ext_arr()): for I in ti.grouped(tensor): t = ti.cast(tensor[I], ti.f32) arr[I, 0] = t @@ -54,8 +56,8 @@ def tensor_to_image(tensor: ti.template(), arr: ti.ext_arr()): arr[I, 2] = t -@ti.kernel -def vector_to_image(mat: ti.template(), arr: ti.ext_arr()): +@kernel +def vector_to_image(mat: template(), arr: ext_arr()): for I in ti.grouped(mat): for p in ti.static(range(mat.n)): arr[I, p] = ti.cast(mat[I][p], ti.f32) @@ -63,21 +65,20 @@ def vector_to_image(mat: ti.template(), arr: ti.ext_arr()): arr[I, 2] = 0 -@ti.kernel -def tensor_to_tensor(tensor: ti.template(), other: ti.template()): +@kernel +def tensor_to_tensor(tensor: template(), other: template()): for I in ti.grouped(tensor): tensor[I] = other[I] -@ti.kernel -def ext_arr_to_tensor(arr: ti.ext_arr(), tensor: ti.template()): +@kernel +def ext_arr_to_tensor(arr: ext_arr(), tensor: template()): for I in ti.grouped(tensor): tensor[I] = arr[I] -@ti.kernel -def matrix_to_ext_arr(mat: ti.template(), arr: ti.ext_arr(), - as_vector: ti.template()): +@kernel +def matrix_to_ext_arr(mat: template(), arr: ext_arr(), as_vector: template()): for I in ti.grouped(mat): for p in ti.static(range(mat.n)): for q in ti.static(range(mat.m)): @@ -87,9 +88,8 @@ def matrix_to_ext_arr(mat: ti.template(), arr: ti.ext_arr(), arr[I, p, q] = mat[I][p, q] -@ti.kernel -def ext_arr_to_matrix(arr: ti.ext_arr(), mat: ti.template(), - as_vector: ti.template()): +@kernel +def ext_arr_to_matrix(arr: ext_arr(), mat: template(), as_vector: template()): for I in ti.grouped(mat): for p in ti.static(range(mat.n)): for q in ti.static(range(mat.m)): @@ -99,36 +99,36 @@ def ext_arr_to_matrix(arr: ti.ext_arr(), mat: ti.template(), mat[I][p, q] = arr[I, p, q] -@ti.kernel -def clear_gradients(vars: ti.template()): +@kernel +def clear_gradients(vars: template()): for I in ti.grouped(Expr(vars[0])): for s in 
ti.static(vars): Expr(s)[I] = 0 -@ti.kernel -def clear_loss(l: ti.template()): +@kernel +def clear_loss(l: template()): # Using SNode writers would result in a forced sync, therefore we wrap these # writes into a kernel. l[None] = 0 l.grad[None] = 1 -@ti.kernel -def fill_matrix(mat: ti.template(), vals: ti.template()): +@kernel +def fill_matrix(mat: template(), vals: template()): for I in ti.grouped(mat): for p in ti.static(range(mat.n)): for q in ti.static(range(mat.m)): mat[I][p, q] = vals[p][q] -@ti.kernel -def snode_deactivate(b: ti.template()): +@kernel +def snode_deactivate(b: template()): for I in ti.grouped(b): ti.deactivate(b, I) -@ti.kernel -def snode_deactivate_dynamic(b: ti.template()): +@kernel +def snode_deactivate_dynamic(b: template()): for I in ti.grouped(b.parent()): ti.deactivate(b, I) diff --git a/python/taichi/lang/ops.py b/python/taichi/lang/ops.py index 6ea9fa84a..4ceac5579 100644 --- a/python/taichi/lang/ops.py +++ b/python/taichi/lang/ops.py @@ -526,7 +526,6 @@ def external_func_call(func, args=[], outputs=[]): def asm(source, inputs=[], outputs=[]): - _ti_core.insert_external_func_call(0, source, make_expr_group(inputs), make_expr_group(outputs)) @@ -567,11 +566,11 @@ def rescale_index(a, b, I): """ assert isinstance(a, Expr) and a.is_global(), \ - f"first arguement must be a field" + f"first arguement must be a field" assert isinstance(b, Expr) and b.is_global(), \ - f"second arguement must be a field" + f"second arguement must be a field" assert isinstance(I, matrix.Matrix) and not I.is_global(), \ - f"third arguement must be a grouped index" + f"third arguement must be a grouped index" Ib = I.copy() for n in range(min(I.n, min(len(a.shape), len(b.shape)))): if a.shape[n] > b.shape[n]: diff --git a/python/taichi/lang/snode.py b/python/taichi/lang/snode.py index 7086d3f88..e782cbde2 100644 --- a/python/taichi/lang/snode.py +++ b/python/taichi/lang/snode.py @@ -140,6 +140,10 @@ def get_shape(self, i): def loop_range(self): return 
Expr(_ti_core.global_var_expr_from_snode(self.ptr)) + @property + def name(self): + return self.ptr.name() + @deprecated('x.snode()', 'x.snode') def __call__(self): # TODO: remove this after v0.7.0 return self diff --git a/python/taichi/lang/transformer.py b/python/taichi/lang/transformer.py index a3a52b9ef..b256b8e92 100644 --- a/python/taichi/lang/transformer.py +++ b/python/taichi/lang/transformer.py @@ -4,6 +4,7 @@ from taichi.lang import impl from taichi.lang.ast_resolver import ASTResolver from taichi.lang.exception import TaichiSyntaxError +from taichi.lang.kernel_arguments import ext_arr, template from taichi.lang.util import to_taichi_type import taichi as ti @@ -625,6 +626,11 @@ def visit_Call(self, node): if not ASTResolver.resolve_to(node.func, ti.static, globals()): # Do not apply the generic visitor if the function called is ti.static self.generic_visit(node) + if isinstance(node.func, ast.Attribute): + attr_name = node.func.attr + if attr_name == 'format': + node.args.insert(0, node.func.value) + node.func = self.parse_expr('ti.ti_format') if isinstance(node.func, ast.Name): func_name = node.func.id if func_name == 'print': @@ -679,9 +685,9 @@ def transform_as_kernel(): for i, arg in enumerate(args.args): # Directly pass in template arguments, # such as class instances ("self"), fields, SNodes, etc. - if isinstance(self.func.argument_annotations[i], ti.template): + if isinstance(self.func.argument_annotations[i], template): continue - if isinstance(self.func.argument_annotations[i], ti.ext_arr): + if isinstance(self.func.argument_annotations[i], ext_arr): arg_init = self.parse_stmt( 'x = ti.lang.kernel_arguments.decl_ext_arr_arg(0, 0)') arg_init.targets[0].id = arg.arg @@ -725,8 +731,7 @@ def transform_as_kernel(): for i, arg in enumerate(args.args): # Directly pass in template arguments, # such as class instances ("self"), fields, SNodes, etc. 
- if isinstance(self.func.argument_annotations[i], - ti.template): + if isinstance(self.func.argument_annotations[i], template): continue # Create a copy for non-template arguments, # so that they are passed by value. diff --git a/python/taichi/misc/gui.py b/python/taichi/misc/gui.py index 073f8c280..deb69e9b1 100644 --- a/python/taichi/misc/gui.py +++ b/python/taichi/misc/gui.py @@ -210,7 +210,12 @@ def set_image(self, img): def circle(self, pos, color=0xFFFFFF, radius=1): self.canvas.circle_single(pos[0], pos[1], color, radius) - def circles(self, pos, color=0xFFFFFF, radius=1): + def circles(self, + pos, + radius=1, + color=0xFFFFFF, + palette=None, + palette_indices=None): n = pos.shape[0] if len(pos.shape) == 3: assert pos.shape[2] == 1 @@ -235,6 +240,40 @@ def circles(self, pos, color=0xFFFFFF, radius=1): raise ValueError( 'Color must be an ndarray or int (e.g., 0x956333)') + if palette is not None: + assert palette_indices is not None, 'palette must be used together with palette_indices' + + from taichi.lang.expr import Expr + + if isinstance(palette_indices, Expr): + ind_int = palette_indices.to_numpy().astype(np.uint32) + elif isinstance(palette_indices, list) or isinstance( + palette_indices, np.ndarray): + ind_int = np.array(palette_indices).astype(np.uint32) + else: + try: + ind_int = np.array(palette_indices) + except: + raise TypeError( + 'palette_indices must be a type that can be converted to numpy.ndarray' + ) + + assert issubclass( + ind_int.dtype.type, + np.integer), 'palette_indices must be an integer array' + assert ind_int.shape == ( + n, + ), 'palette_indices must be in 1-d shape with shape (num_particles, )' + assert min( + ind_int + ) >= 0, 'the min of palette_indices must not be less than zero' + assert max(ind_int) < len( + palette + ), 'the max of palette_indices must not exceed the length of palette' + color_array = np.array(palette, dtype=np.uint32)[ind_int] + color_array = np.ascontiguousarray(color_array) + color_array = 
color_array.ctypes.data + if isinstance(radius, np.ndarray): assert radius.shape == (n, ) radius = np.ascontiguousarray(radius.astype(np.float32)) diff --git a/python/taichi/snode/fields_builder.py b/python/taichi/snode/fields_builder.py index 41d688ae8..f87209d1c 100644 --- a/python/taichi/snode/fields_builder.py +++ b/python/taichi/snode/fields_builder.py @@ -5,6 +5,7 @@ from taichi.core.util import ti_core as _ti_core from taichi.lang import impl, snode from taichi.lang.exception import InvalidOperationError +from taichi.misc.util import warning _snode_registry = _ti_core.SNodeRegistry() @@ -37,6 +38,16 @@ def __init__(self): self._ptr = _snode_registry.create_root() self._root = snode.SNode(self._ptr) self._finalized = False + self._empty = True + + @classmethod + def finalized_roots(cls): + roots_ptr = [] + size = impl.get_runtime().prog.get_snode_tree_size() + for i in range(size): + res = impl.get_runtime().prog.get_snode_root(i) + roots_ptr.append(snode.SNode(res)) + return roots_ptr @property def ptr(self): @@ -46,16 +57,26 @@ def ptr(self): def root(self): return self._root + @property + def empty(self): + return self._empty + + @property + def finalized(self): + return self._finalized + def dense(self, indices: Union[Sequence[_Axis], _Axis], dimensions: Union[Sequence[int], int]): """Same as :func:`taichi.SNode.dense`""" self._check_not_finalized() + self._empty = False return self._root.dense(indices, dimensions) def pointer(self, indices: Union[Sequence[_Axis], _Axis], dimensions: Union[Sequence[int], int]): """Same as :func:`taichi.SNode.pointer`""" self._check_not_finalized() + self._empty = False return self._root.pointer(indices, dimensions) def hash(self, indices, dimensions): @@ -67,23 +88,27 @@ def dynamic(self, chunk_size: Optional[int] = None): """Same as :func:`taichi.SNode.dynamic`""" self._check_not_finalized() + self._empty = False return self._root.dynamic(index, dimension, chunk_size) def bitmasked(self, indices: Union[Sequence[_Axis], 
_Axis], dimensions: Union[Sequence[int], int]): """Same as :func:`taichi.SNode.bitmasked`""" self._check_not_finalized() + self._empty = False return self._root.bitmasked(indices, dimensions) def bit_struct(self, num_bits: int): """Same as :func:`taichi.SNode.bit_struct`""" self._check_not_finalized() + self._empty = False return self._root.bit_struct(num_bits) def bit_array(self, indices: Union[Sequence[_Axis], _Axis], dimensions: Union[Sequence[int], int], num_bits: int): """Same as :func:`taichi.SNode.bit_array`""" self._check_not_finalized() + self._empty = False return self._root.bit_array(indices, dimensions, num_bits) def place(self, @@ -92,17 +117,21 @@ def place(self, shared_exponent: bool = False): """Same as :func:`taichi.SNode.place`""" self._check_not_finalized() + self._empty = False self._root.place(*args, offset=offset, shared_exponent=shared_exponent) def lazy_grad(self): """Same as :func:`taichi.SNode.lazy_grad`""" # TODO: This complicates the implementation. Figure out why we need this self._check_not_finalized() + self._empty = False self._root.lazy_grad() def finalize(self): """Constructs the SNodeTree and finalizes this builder.""" self._check_not_finalized() + if self._empty: + warning("Finalizing an empty FieldsBuilder!") _ti_core.finalize_snode_tree(_snode_registry, self._ptr, impl.get_runtime().prog) self._finalized = True @@ -110,7 +139,3 @@ def finalize(self): def _check_not_finalized(self): if self._finalized: raise InvalidOperationError('FieldsBuilder finalized') - - @property - def finalized(self): - return self._finalized diff --git a/python/taichi/torch_io.py b/python/taichi/torch_io.py index 30794729a..8a800a151 100644 --- a/python/taichi/torch_io.py +++ b/python/taichi/torch_io.py @@ -1,14 +1,15 @@ -import taichi as ti +from taichi.lang.kernel_arguments import ext_arr, template +from taichi.lang.kernel_impl import kernel -@ti.kernel -def from_torch_template(expr: ti.template(), torch_tensor: ti.ext_arr()): +@kernel +def 
from_torch_template(expr: template(), torch_tensor: ext_arr()): for i in expr: expr[i] = torch_tensor[i] -@ti.kernel -def to_torch_template(expr: ti.template(), torch_tensor: ti.ext_arr()): +@kernel +def to_torch_template(expr: template(), torch_tensor: ext_arr()): for i in expr: torch_tensor[i] = expr[i] diff --git a/python/setup.cfg b/setup.cfg similarity index 100% rename from python/setup.cfg rename to setup.cfg diff --git a/setup.py b/setup.py index cad2a6d6d..06f5a41f7 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,15 @@ import glob +import multiprocessing +import os +import platform +import shutil +import subprocess +import sys -import setuptools +from setuptools import Extension, find_packages, setup +from setuptools.command.build_ext import build_ext +from setuptools.command.build_py import build_py +from setuptools.command.egg_info import egg_info classifiers = [ 'Development Status :: 2 - Pre-Alpha', @@ -16,35 +25,187 @@ 'Programming Language :: Python :: 3.9', ] +project_name = os.getenv('PROJECT_NAME', 'taichi') +TI_VERSION_MAJOR = 0 +TI_VERSION_MINOR = 7 +TI_VERSION_PATCH = 27 +version = f'{TI_VERSION_MAJOR}.{TI_VERSION_MINOR}.{TI_VERSION_PATCH}' + data_files = glob.glob('python/lib/*') print(data_files) -packages = setuptools.find_packages() + ['taichi.examples'] +packages = find_packages('python') print(packages) -setuptools.setup(name=project_name, - packages=packages, - version=version, - description='The Taichi Programming Language', - author='Taichi developers', - author_email='yuanmhu@gmail.com', - url='https://github.com/taichi-dev/taichi', - install_requires=[ - 'numpy', - 'pybind11>=2.5.0', - 'sourceinspect>=0.0.4', - 'colorama', - 'astor', - ], - data_files=[('lib', data_files)], - keywords=['graphics', 'simulation'], - license='MIT', - include_package_data=True, - entry_points={ - 'console_scripts': [ - 'ti=taichi.main:main', - ], - }, - classifiers=classifiers, - has_ext_modules=lambda: True) - -# Note: this is a template setup.py used 
by python/build.py +# Our python package root dir is python/ +package_dir = 'python' + + +def get_python_executable(): + return sys.executable.replace('\\', '/') + + +def get_os_name(): + name = platform.platform() + # in python 3.8, platform.platform() uses mac_ver() on macOS + # it will return 'macOS-XXXX' instead of 'Darwin-XXXX' + if name.lower().startswith('darwin') or name.lower().startswith('macos'): + return 'osx' + elif name.lower().startswith('windows'): + return 'win' + elif name.lower().startswith('linux'): + return 'linux' + assert False, "Unknown platform name %s" % name + + +def remove_tmp(taichi_dir): + shutil.rmtree(os.path.join(taichi_dir, 'assets'), ignore_errors=True) + shutil.rmtree(os.path.join(taichi_dir, 'examples'), ignore_errors=True) + shutil.rmtree(os.path.join(taichi_dir, 'tests'), ignore_errors=True) + + +class CMakeExtension(Extension): + def __init__(self, name): + Extension.__init__(self, name, sources=[]) + + +class EggInfo(egg_info): + def run(self): + taichi_dir = os.path.join(package_dir, 'taichi') + remove_tmp(taichi_dir) + shutil.rmtree('build', ignore_errors=True) + + shutil.copytree('tests/python', os.path.join(taichi_dir, 'tests')) + shutil.copytree('examples', os.path.join(taichi_dir, 'examples')) + shutil.copytree('external/assets', os.path.join(taichi_dir, 'assets')) + + egg_info.run(self) + + +# python setup.py build runs the following commands in order: +# python setup.py build_py +# python setup.py build_ext +class BuildPy(build_py): + def run(self): + build_py.run(self) + taichi_dir = os.path.join(package_dir, 'taichi') + remove_tmp(taichi_dir) + + +class CMakeBuild(build_ext): + def parse_cmake_args_from_env(self): + # Source: TAICHI_CMAKE_ARGS=... python setup.py ... 
+ cmake_args = os.getenv('TAICHI_CMAKE_ARGS', '') + return cmake_args.strip().split() + + def run(self): + try: + out = subprocess.check_output(['cmake', '--version']) + except OSError: + raise RuntimeError( + "CMake must be installed to build the following extensions: " + + ", ".join(e.name for e in self.extensions)) + + # CMakeLists.txt is in the same directory as this setup.py file + cmake_list_dir = os.path.abspath(os.path.dirname(__file__)) + self.build_temp = os.path.join(cmake_list_dir, 'build') + + build_directory = os.path.abspath(self.build_temp) + + cmake_args = self.parse_cmake_args_from_env() + + cmake_args += [ + f'-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={build_directory}', + f'-DPYTHON_EXECUTABLE={get_python_executable()}', + f'-DTI_VERSION_MAJOR={TI_VERSION_MAJOR}', + f'-DTI_VERSION_MINOR={TI_VERSION_MINOR}', + f'-DTI_VERSION_PATCH={TI_VERSION_PATCH}', + ] + + cfg = 'Debug' if self.debug else 'Release' + build_args = ['--config', cfg] + + cmake_args += ['-DCMAKE_BUILD_TYPE=' + cfg] + + # Assuming Makefiles + build_args += ['--', f'-j{multiprocessing.cpu_count()}'] + + self.build_args = build_args + + env = os.environ.copy() + os.makedirs(self.build_temp, exist_ok=True) + + print('-' * 10, 'Running CMake prepare', '-' * 40) + subprocess.check_call(['cmake', cmake_list_dir] + cmake_args, + cwd=self.build_temp, + env=env) + + print('-' * 10, 'Building extensions', '-' * 40) + cmake_cmd = ['cmake', '--build', '.'] + self.build_args + subprocess.check_call(cmake_cmd, cwd=self.build_temp) + + self.prepare_package() + + def prepare_package(self): + # We need to make sure these additional files are ready for + # - develop mode: must exist in local python/taichi/lib/ folder + # - install mode: must exist in self.build_lib/taichi/lib + taichi_lib_dir = 'taichi/lib' + for target in ( + os.path.join(package_dir, taichi_lib_dir), + os.path.join(self.build_lib, taichi_lib_dir), + ): + shutil.rmtree(target, ignore_errors=True) + os.makedirs(target) + if get_os_name() == 
'linux': + shutil.copy(os.path.join(self.build_temp, 'libtaichi_core.so'), + os.path.join(target, 'taichi_core.so')) + elif get_os_name() == 'osx': + shutil.copy( + os.path.join(self.build_temp, 'libtaichi_core.dylib'), + os.path.join(target, 'taichi_core.so')) + else: + shutil.copy('../runtimes/RelWithDebInfo/taichi_core.dll', + os.path.join(target, 'taichi_core.pyd')) + + if get_os_name() != 'osx': + libdevice_path = 'external/cuda_libdevice/slim_libdevice.10.bc' + print("copying libdevice:", libdevice_path) + assert os.path.exists(libdevice_path) + shutil.copy(libdevice_path, + os.path.join(target, 'slim_libdevice.10.bc')) + + llvm_runtime_dir = 'taichi/runtime/llvm' + for f in os.listdir(llvm_runtime_dir): + if f.startswith('runtime_') and f.endswith('.bc'): + print(f"Fetching runtime file {f} to {target} folder") + shutil.copy(os.path.join(llvm_runtime_dir, f), target) + + +setup(name=project_name, + packages=packages, + package_dir={"": package_dir}, + version=version, + description='The Taichi Programming Language', + author='Taichi developers', + author_email='yuanmhu@gmail.com', + url='https://github.com/taichi-dev/taichi', + install_requires=[ + 'numpy', + 'pybind11>=2.5.0', + 'sourceinspect>=0.0.4', + 'colorama', + 'astor', + ], + data_files=[('lib', data_files)], + keywords=['graphics', 'simulation'], + license='MIT', + include_package_data=True, + entry_points={ + 'console_scripts': [ + 'ti=taichi.main:main', + ], + }, + classifiers=classifiers, + ext_modules=[CMakeExtension('taichi_core')], + cmdclass=dict(egg_info=EggInfo, build_py=BuildPy, build_ext=CMakeBuild), + has_ext_modules=lambda: True) diff --git a/taichi/backends/metal/shaders/runtime_structs.metal.h b/taichi/backends/metal/shaders/runtime_structs.metal.h index 171e8463f..9b3347809 100644 --- a/taichi/backends/metal/shaders/runtime_structs.metal.h +++ b/taichi/backends/metal/shaders/runtime_structs.metal.h @@ -117,7 +117,7 @@ STR( int32_t start = 0; int32_t num_bits = 0; int32_t acc_offset 
= 0; - int32_t num_elements = 0; + int32_t num_elements_from_root = 0; }; Extractor extractors[kTaichiMaxNumIndices]; diff --git a/taichi/backends/vulkan/codegen_vulkan.cpp b/taichi/backends/vulkan/codegen_vulkan.cpp new file mode 100644 index 000000000..8a6b9f4bf --- /dev/null +++ b/taichi/backends/vulkan/codegen_vulkan.cpp @@ -0,0 +1,795 @@ +#include "taichi/backends/vulkan/codegen_vulkan.h" + +#include +#include + +#include "taichi/program/program.h" +#include "taichi/program/kernel.h" +#include "taichi/ir/statements.h" +#include "taichi/ir/ir.h" +#include "taichi/util/line_appender.h" +#include "taichi/backends/vulkan/kernel_utils.h" +#include "taichi/backends/vulkan/runtime.h" +#include "taichi/backends/opengl/opengl_data_types.h" +#include "taichi/ir/transforms.h" + +namespace taichi { +namespace lang { +namespace vulkan { +namespace { + +constexpr char kRootBufferName[] = "root_buffer"; +constexpr char kGlobalTmpsBufferName[] = "global_tmps_buffer"; +constexpr char kContextBufferName[] = "context_buffer"; + +constexpr char kGlobalInvocationIDName[] = "int(gl_GlobalInvocationID.x)"; +constexpr char kLinearLoopIndexName[] = "linear_loop_idx_"; + +constexpr int kMaxNumThreadsGridStrideLoop = 65536; + +#define TI_INSIDE_VULKAN_CODEGEN +#include "taichi/backends/vulkan/shaders/atomics.glsl.h" +#undef TI_INSIDE_VULKAN_CODEGEN + +using opengl::opengl_data_type_name; +using BuffersEnum = TaskAttributes::Buffers; +using BufferBind = TaskAttributes::BufferBind; + +std::string buffer_instance_name(BuffersEnum b) { + // https://www.khronos.org/opengl/wiki/Interface_Block_(GLSL)#Syntax + switch (b) { + case BuffersEnum::Root: + return kRootBufferName; + case BuffersEnum::GlobalTmps: + return kGlobalTmpsBufferName; + case BuffersEnum::Context: + return kContextBufferName; + default: + TI_NOT_IMPLEMENTED; + break; + } + return {}; +} + +std::string store_as_int_bits(const std::string &in, DataType dt) { + if (dt->is_primitive(PrimitiveTypeID::f32)) { + return 
fmt::format("floatBitsToInt({})", in); + } + return in; +} + +std::string load_from_int_bits(const std::string &in, DataType dt) { + if (dt->is_primitive(PrimitiveTypeID::f32)) { + return fmt::format("intBitsToFloat({})", in); + } + return in; +} + +std::string vk_data_address_shifter(const Stmt *s, DataType) { + // Hardcoded ">> 2" because we only support 32-bit for now. + return fmt::format("({} >> 2)", s->raw_name()); +} + +class TaskCodegen : public IRVisitor { + private: + enum class Section { + Headers, + Kernels, + }; + + static constexpr Section kAllSections[] = { + Section::Headers, + Section::Kernels, + }; + + public: + struct Params { + OffloadedStmt *task_ir; + const CompiledSNodeStructs *compiled_structs; + const KernelContextAttributes *ctx_attribs; + std::string ti_kernel_name; + int task_id_in_kernel; + }; + + explicit TaskCodegen(const Params ¶ms) + : task_ir_(params.task_ir), + compiled_structs_(params.compiled_structs), + ctx_attribs_(params.ctx_attribs), + task_name_(fmt::format("{}_t{:02d}", + params.ti_kernel_name, + params.task_id_in_kernel)) { + allow_undefined_visitor = true; + invoke_default_visitor = true; + } + + struct Result { + std::string source_code; + TaskAttributes task_attribs; + }; + + Result run() { + code_section_ = Section::Kernels; + if (task_ir_->task_type == OffloadedTaskType::serial) { + generate_serial_kernel(task_ir_); + } else if (task_ir_->task_type == OffloadedTaskType::range_for) { + // struct_for is automatically lowered to ranged_for for dense snodes + generate_range_for_kernel(task_ir_); + } else { + TI_ERROR("Unsupported offload type={} on Vulkan arch", + task_ir_->task_name()); + } + // Headers need global information, so it has to be delayed after visiting + // the task IR. 
+ emit_headers(); + + Result res; + for (const auto s : kAllSections) { + res.source_code += section_appenders_.find(s)->second.lines(); + res.source_code += '\n'; + } + res.task_attribs = std::move(task_attribs_); + return res; + } + + void visit(OffloadedStmt *) override { + TI_ERROR("This codegen is supposed to deal with one offloaded task"); + } + + void visit(Block *stmt) override { + push_indent(); + for (auto &s : stmt->statements) { + s->accept(this); + } + pop_indent(); + } + + void visit(ConstStmt *const_stmt) override { + TI_ASSERT(const_stmt->width() == 1); + emit("const {} {} = {};", opengl_data_type_name(const_stmt->element_type()), + const_stmt->raw_name(), const_stmt->val[0].stringify()); + } + + void visit(AllocaStmt *alloca) override { + emit("{} {} = 0;", opengl_data_type_name(alloca->element_type()), + alloca->raw_name()); + } + + void visit(LocalLoadStmt *stmt) override { + // TODO: optimize for partially vectorized load... + bool linear_index = true; + for (int i = 0; i < (int)stmt->src.size(); i++) { + if (stmt->src[i].offset != i) { + linear_index = false; + } + } + if (stmt->same_source() && linear_index && + stmt->width() == stmt->src[0].var->width()) { + auto ptr = stmt->src[0].var; + emit("const {} {} = {};", opengl_data_type_name(stmt->element_type()), + stmt->raw_name(), ptr->raw_name()); + } else { + TI_NOT_IMPLEMENTED; + } + } + + void visit(LocalStoreStmt *stmt) override { + emit("{} = {};", stmt->dest->raw_name(), stmt->val->raw_name()); + } + + void visit(GetRootStmt *stmt) override { + // Should we assert |root_stmt_| is assigned only once? + root_stmt_ = stmt; + emit("const int {} = 0;", stmt->raw_name()); + } + + void visit(GetChStmt *stmt) override { + // TODO: GetChStmt -> GetComponentStmt ? 
+ const auto &snode_descs = compiled_structs_->snode_descriptors; + auto *out_snode = stmt->output_snode; + TI_ASSERT(snode_descs.at(stmt->input_snode->id).get_child(stmt->chid) == + out_snode); + + emit("// SNode: {} -> {}", stmt->input_snode->node_type_name, + out_snode->node_type_name); + emit("const int {} = {} + {};", stmt->raw_name(), + stmt->input_ptr->raw_name(), + snode_descs.at(out_snode->id).mem_offset_in_parent_cell); + if (out_snode->is_place()) { + TI_ASSERT(ptr_to_buffers_.count(stmt) == 0); + ptr_to_buffers_[stmt] = BuffersEnum::Root; + } + } + + void visit(SNodeLookupStmt *stmt) override { + // TODO: SNodeLookupStmt -> GetSNodeCellStmt ? + std::string parent; + if (stmt->input_snode) { + parent = stmt->input_snode->raw_name(); + } else { + TI_ASSERT(root_stmt_ != nullptr); + parent = root_stmt_->raw_name(); + } + const auto *sn = stmt->snode; + + if (stmt->activate && !(sn->type == SNodeType::dense)) { + // Sparse SNode not supported yet. + TI_NOT_IMPLEMENTED; + } + const auto &snode_descs = compiled_structs_->snode_descriptors; + emit("// Get the cell of SNode {}", sn->node_type_name); + emit("const int {} = {} + ({} * {});", stmt->raw_name(), parent, + stmt->input_index->raw_name(), snode_descs.at(sn->id).cell_stride); + } + + void visit(LinearizeStmt *stmt) override { + std::string val = "0"; + for (int i = 0; i < (int)stmt->inputs.size(); i++) { + val = fmt::format("({} * {} + {})", val, stmt->strides[i], + stmt->inputs[i]->raw_name()); + } + emit("const int {} = {};", stmt->raw_name(), val); + } + + void visit(BitExtractStmt *stmt) override { + emit("const int {} = (({} >> {}) & ((1 << {}) - 1));", stmt->raw_name(), + stmt->input->raw_name(), stmt->bit_begin, + stmt->bit_end - stmt->bit_begin); + } + + void visit(LoopIndexStmt *stmt) override { + const auto stmt_name = stmt->raw_name(); + if (stmt->loop->is()) { + const auto type = stmt->loop->as()->task_type; + if (type == OffloadedTaskType::range_for) { + TI_ASSERT(stmt->index == 0); + 
emit("const int {} = {};", stmt_name, kLinearLoopIndexName); + } else { + TI_NOT_IMPLEMENTED; + } + } else if (stmt->loop->is()) { + TI_ASSERT(stmt->index == 0); + emit("const int {} = {};", stmt_name, stmt->loop->raw_name()); + } else { + TI_NOT_IMPLEMENTED; + } + } + + void visit(GlobalStoreStmt *stmt) override { + TI_ASSERT(stmt->width() == 1); + const auto dt = stmt->val->element_type(); + emit("{} = {};", at_buffer(stmt->dest, dt), + store_as_int_bits(stmt->val->raw_name(), dt)); + } + + void visit(GlobalLoadStmt *stmt) override { + TI_ASSERT(stmt->width() == 1); + auto dt = stmt->element_type(); + const auto loaded_int = at_buffer(stmt->src, dt); + emit("const {} {} = {};", opengl_data_type_name(dt), stmt->raw_name(), + load_from_int_bits(loaded_int, dt)); + } + + void visit(ArgLoadStmt *stmt) override { + const auto arg_id = stmt->arg_id; + const auto &arg_attribs = ctx_attribs_->args()[arg_id]; + const auto offset_in_mem = arg_attribs.offset_in_mem; + if (stmt->is_ptr) { + emit("// Pointer arg: id={} offset_in_mem={}", arg_id, offset_in_mem); + // Do not shift! We are indexing the buffers at byte granularity. 
+ emit("const int {} = {};", stmt->raw_name(), offset_in_mem); + } else { + const auto dt = arg_attribs.dt; + const auto val_str = fmt::format("{}[{}]", kContextBufferName, + (offset_in_mem / sizeof(int32_t))); + emit("// Scalar arg: id={} offset_in_mem={}", arg_id, offset_in_mem); + emit("const {} {} = {};", opengl_data_type_name(dt), stmt->raw_name(), + load_from_int_bits(val_str, dt)); + } + } + + void visit(ReturnStmt *stmt) override { + // TODO: use stmt->ret_id instead of 0 as index + const auto &ret_attribs = ctx_attribs_->rets()[0]; + const int index_in_buffer = ret_attribs.offset_in_mem / sizeof(int32_t); + emit("// Return value: offset_in_mem={}", ret_attribs.offset_in_mem); + emit("{}[{}] = {};", kContextBufferName, index_in_buffer, + store_as_int_bits(stmt->value->raw_name(), ret_attribs.dt)); + } + + void visit(GlobalTemporaryStmt *stmt) override { + TI_ASSERT(stmt->width() == 1); + const auto dt = opengl_data_type_name(stmt->element_type().ptr_removed()); + emit("const int {} = {}", stmt->raw_name(), stmt->offset); + ptr_to_buffers_[stmt] = BuffersEnum::GlobalTmps; + } + + void visit(ExternalPtrStmt *stmt) override { + // Used mostly for transferring data between host (e.g. numpy array) and + // Vulkan. 
+ TI_ASSERT(stmt->width() == 1); + const auto linear_offset_name = + fmt::format("{}_linear_mem_offset_", stmt->raw_name()); + emit("int {} = 0;", linear_offset_name); + emit("{{"); + { + ScopedIndent s(current_appender()); + const auto *argload = stmt->base_ptrs[0]->as(); + const int arg_id = argload->arg_id; + const int num_indices = stmt->indices.size(); + std::vector size_var_names; + const auto extra_args_mem_offset = ctx_attribs_->extra_args_mem_offset(); + const auto extra_args_index_base = + (extra_args_mem_offset / sizeof(int32_t)); + emit("// External ptr, extra args: mem_offset={} index_base={}", + extra_args_mem_offset, extra_args_index_base); + for (int i = 0; i < num_indices; i++) { + std::string var_name = fmt::format("{}_size{}_", stmt->raw_name(), i); + const auto extra_arg_linear_index_offset = + (arg_id * taichi_max_num_indices) + i; + const auto extra_arg_linear_index = + extra_args_index_base + extra_arg_linear_index_offset; + emit("// Extra arg: arg_id={} i={} linear_index=({} + {})={}", arg_id, + i, extra_args_index_base, extra_arg_linear_index_offset, + extra_arg_linear_index); + emit("const int {} = {}[{}];", var_name, kContextBufferName, + extra_arg_linear_index); + size_var_names.push_back(std::move(var_name)); + } + for (int i = 0; i < num_indices; i++) { + emit("{} *= {};", linear_offset_name, size_var_names[i]); + emit("{} += {};", linear_offset_name, stmt->indices[i]->raw_name()); + } + emit("// Convert index to bytes"); + emit("{} = ({} << 2);", linear_offset_name, linear_offset_name); + } + emit("}}"); + emit("const int {} = ({} + {});", stmt->raw_name(), + stmt->base_ptrs[0]->raw_name(), linear_offset_name); + ptr_to_buffers_[stmt] = BuffersEnum::Context; + } + + void visit(UnaryOpStmt *stmt) override { + const auto dt_name = opengl_data_type_name(stmt->element_type()); + const auto var_decl = fmt::format("const {} {}", dt_name, stmt->raw_name()); + const auto operand_name = stmt->operand->raw_name(); + + if (stmt->op_type == 
UnaryOpType::logic_not) { + emit("{} = {}({} == 0);", var_decl, dt_name, operand_name); + } else if (stmt->op_type == UnaryOpType::neg) { + emit("{} = -{}({});", var_decl, dt_name, operand_name); + } else if (stmt->op_type == UnaryOpType::rsqrt) { + emit("{} = {}(inversesqrt({}));", var_decl, dt_name, operand_name); + } else if (stmt->op_type == UnaryOpType::sgn) { + emit("{} = {}(sign({}));", var_decl, dt_name, operand_name); + } else if (stmt->op_type == UnaryOpType::bit_not) { + emit("{} = ~{}({});", var_decl, dt_name, operand_name); + } else if (stmt->op_type == UnaryOpType::cast_value) { + emit("{} = {}({});", var_decl, dt_name, operand_name); + } else if (stmt->op_type == UnaryOpType::cast_bits) { + constexpr int kFloatingPoint = 0; + constexpr int kSignedInteger = 1; + constexpr int kUnsignedInteger = 2; + + const auto dst_type = stmt->cast_type; + const auto src_type = stmt->operand->element_type(); + auto dst_type_id = kFloatingPoint; + if (is_integral(dst_type)) { + dst_type_id = is_unsigned(dst_type) ? kUnsignedInteger : kSignedInteger; + } + auto src_type_id = kFloatingPoint; + if (is_integral(src_type)) { + src_type_id = is_unsigned(src_type) ? 
kUnsignedInteger : kSignedInteger; + } + + TI_ASSERT_INFO( + data_type_size(dst_type) == data_type_size(src_type), + "bit_cast is only supported between data type with same size"); + + if (dst_type_id != kFloatingPoint && src_type_id != kFloatingPoint) { + emit("{} = {}({});", var_decl, dt_name, operand_name); + } else if (dst_type_id == kFloatingPoint && + src_type_id == kSignedInteger) { + emit("{} = intBitsToFloat({});", var_decl, operand_name); + } else if (dst_type_id == kSignedInteger && + src_type_id == kFloatingPoint) { + emit("{} = floatBitsToInt({});", var_decl, operand_name); + } else if (dst_type_id == kFloatingPoint && + src_type_id == kUnsignedInteger) { + emit("{} = uintBitsToFloat({});", var_decl, operand_name); + } else if (dst_type_id == kUnsignedInteger && + src_type_id == kFloatingPoint) { + emit("{} = floatBitsToUint({});", var_decl, operand_name); + } else { + TI_ERROR("[glsl] unsupported bit cast from {} to {}", + data_type_name(src_type), data_type_name(dst_type)); + } + } else { + emit("{} = {}({});", var_decl, unary_op_type_name(stmt->op_type), + operand_name); + } + } + + void visit(BinaryOpStmt *bin) override { + const auto dt_name = opengl_data_type_name(bin->element_type()); + const auto lhs_name = bin->lhs->raw_name(); + const auto rhs_name = bin->rhs->raw_name(); + const auto bin_name = bin->raw_name(); + const auto op_type = bin->op_type; + const auto var_decl = fmt::format("const {} {}", dt_name, bin_name); + if (op_type == BinaryOpType::floordiv) { + if (is_integral(bin->lhs->element_type()) && + is_integral(bin->rhs->element_type())) { + emit( + "{} = {}(sign({}) * {} >= 0 ? 
abs({}) / abs({}) : " + "sign({}) * " + "(abs({}) + abs({}) - 1) / {});", + var_decl, dt_name, lhs_name, rhs_name, lhs_name, rhs_name, lhs_name, + lhs_name, rhs_name, rhs_name); + } else { + emit("{} = floor({} / {});", var_decl, lhs_name, rhs_name); + } + return; + } + if (bin->op_type == BinaryOpType::mod) { + // NOTE: the GLSL built-in function `mod()` is a pythonic mod: x - y * + // floor(x / y) + emit("{} = {} - {} * int({} / {});", var_decl, lhs_name, rhs_name, + lhs_name, rhs_name); + return; + } + + const auto binop = binary_op_type_symbol(bin->op_type); + if (opengl::is_opengl_binary_op_infix(op_type)) { + if (is_comparison(op_type)) { + // TODO(#577): Taichi uses -1 as true due to LLVM i1. + emit(" {} = -{}({} {} {});", var_decl, dt_name, lhs_name, binop, + rhs_name); + } else { + emit("{} = {}({} {} {});", var_decl, dt_name, lhs_name, binop, + rhs_name); + } + } else { + // This is a function call + emit("{} = {}({}, {});", var_decl, binop, lhs_name, rhs_name); + } + } + + void visit(TernaryOpStmt *tri) override { + TI_ASSERT(tri->op_type == TernaryOpType::select); + emit("const {} {} = ({}) ? ({}) : ({});", + opengl_data_type_name(tri->element_type()), tri->raw_name(), + tri->op1->raw_name(), tri->op2->raw_name(), tri->op3->raw_name()); + } + + void visit(AtomicOpStmt *stmt) override { + TI_ASSERT(stmt->width() == 1); + if (stmt->op_type != AtomicOpType::add) { + TI_NOT_IMPLEMENTED; + } + const auto dt = stmt->dest->element_type().ptr_removed(); + std::string func = "atomicAdd"; // GLSL builtin + std::string mem = at_buffer(stmt->dest, dt); + if (dt->is_primitive(PrimitiveTypeID::f32)) { + // Buffer has to be specified in the fatomicAdd helpers. 
+ const std::string buffer_name = + buffer_instance_name(ptr_to_buffers_.at(stmt->dest)); + func = fmt::format("fatomicAdd_{}", buffer_name); + mem = vk_data_address_shifter(stmt->dest, dt); + } else if (!is_integral(dt)) { + TI_ERROR("Vulkan only supports 32-bit atomic data types"); + } + // const dt stmt = atomicAdd(mem, val); + emit("const {} {} = {}({}, {});", opengl_data_type_name(dt), + stmt->raw_name(), func, mem, stmt->val->raw_name()); + } + + void visit(IfStmt *if_stmt) override { + emit("if ({} != 0) {{", if_stmt->cond->raw_name()); + if (if_stmt->true_statements) { + if_stmt->true_statements->accept(this); + } + emit("}} else {{"); + if (if_stmt->false_statements) { + if_stmt->false_statements->accept(this); + } + emit("}}"); + } + + void visit(RangeForStmt *for_stmt) override { + TI_ASSERT(for_stmt->width() == 1); + auto loop_var_name = for_stmt->raw_name(); + if (!for_stmt->reversed) { + emit("for (int {}_ = {}; {}_ < {}; {}_ = {}_ + {}) {{", loop_var_name, + for_stmt->begin->raw_name(), loop_var_name, + for_stmt->end->raw_name(), loop_var_name, loop_var_name, 1); + emit(" int {} = {}_;", loop_var_name, loop_var_name); + } else { + // reversed for loop + emit("for (int {}_ = {} - 1; {}_ >= {}; {}_ = {}_ - {}) {{", + loop_var_name, for_stmt->end->raw_name(), loop_var_name, + for_stmt->begin->raw_name(), loop_var_name, loop_var_name, 1); + emit(" int {} = {}_;", loop_var_name, loop_var_name); + } + for_stmt->body->accept(this); + emit("}}"); + } + + void visit(WhileStmt *stmt) override { + emit("while (true) {{"); + stmt->body->accept(this); + emit("}}"); + } + + void visit(WhileControlStmt *stmt) override { + emit("if ({} == 0) break;", stmt->cond->raw_name()); + } + + void visit(ContinueStmt *stmt) override { + if (stmt->as_return()) { + emit("return;"); + } else { + emit("continue;"); + } + } + + private: + void emit_headers() { + SectionGuard sg(this, Section::Headers); + + emit("#version 450"); + emit("layout(local_size_x={}, local_size_y=1, 
local_size_z=1) in;", + task_attribs_.advisory_num_threads_per_group); + emit(""); + for (const auto &bb : task_attribs_.buffer_binds) { + // e.g. + // layout(std430, binding=0) buffer Root { int root_buffer[]; }; + emit("layout(std430, binding={}) buffer {} {{ int {}[]; }};", bb.binding, + TaskAttributes::buffers_name(bb.type), + buffer_instance_name(bb.type)); + } + emit(""); + emit("// Helpers"); + current_appender().append_raw(kVulkanAtomicsSourceCode); + } + + void generate_serial_kernel(OffloadedStmt *stmt) { + task_attribs_.name = task_name_; + task_attribs_.task_type = OffloadedTaskType::serial; + task_attribs_.buffer_binds = get_common_buffer_binds(); + task_attribs_.advisory_total_num_threads = 1; + task_attribs_.advisory_num_threads_per_group = 1; + + const auto func_name = single_work_func_name(); + // The computation for a single work is wrapped inside a function, so that + // we can do grid-strided loop. + emit_single_work_func_def(func_name, stmt->body.get()); + // The actual compute kernel entry point. + emit("void main() {{"); + { + ScopedIndent s(current_appender()); + emit("// serial"); + emit("if ({} > 0) return;", kGlobalInvocationIDName); + + emit_call_single_work_func(func_name, /*loop_index_expr=*/"0"); + } + // Close kernel + emit("}}\n"); + } + + void generate_range_for_kernel(OffloadedStmt *stmt) { + task_attribs_.name = task_name_; + task_attribs_.task_type = OffloadedTaskType::range_for; + task_attribs_.buffer_binds = get_common_buffer_binds(); + + task_attribs_.range_for_attribs = TaskAttributes::RangeForAttributes(); + auto &range_for_attribs = task_attribs_.range_for_attribs.value(); + range_for_attribs.const_begin = stmt->const_begin; + range_for_attribs.const_end = stmt->const_end; + range_for_attribs.begin = + (stmt->const_begin ? stmt->begin_value : stmt->begin_offset); + range_for_attribs.end = + (stmt->const_end ? 
stmt->end_value : stmt->end_offset); + + const auto func_name = single_work_func_name(); + emit_single_work_func_def(func_name, stmt->body.get()); + + emit("void main() {{"); + push_indent(); + const std::string total_elems_name("total_elems"); + std::string begin_expr; + if (range_for_attribs.const_range()) { + const int num_elems = range_for_attribs.end - range_for_attribs.begin; + begin_expr = std::to_string(stmt->begin_value); + emit("// range_for, range known at compile time"); + emit("const int {} = {};", total_elems_name, num_elems); + task_attribs_.advisory_total_num_threads = num_elems; + } else { + TI_NOT_IMPLEMENTED; + } + // begin_ = thread_id + begin_expr + emit("const int begin_ = {} + {};", kGlobalInvocationIDName, begin_expr); + // end_ = total_elems + begin_expr + emit("const int end_ = {} + {};", total_elems_name, begin_expr); + // For now, |total_invocs_name| is equal to |total_elems|. Once we support + // dynamic range, they will be different. + const std::string total_invocs_name = "total_invocs"; + // https://www.khronos.org/opengl/wiki/Compute_Shader#Inputs + emit("const int {} = int(gl_NumWorkGroups.x * gl_WorkGroupSize.x);", + total_invocs_name); + // grid-strided loop + emit("for (int ii = begin_; ii < end_; ii += {}) {{", total_invocs_name); + { + ScopedIndent s2(current_appender()); + emit_call_single_work_func(func_name, /*loop_index_expr=*/"ii"); + } + emit("}}"); // closes for loop + + pop_indent(); + // Close kernel + emit("}}\n"); + // TODO: runtime needs to verify if block_dim is feasible + task_attribs_.advisory_num_threads_per_group = stmt->block_dim; + } + + void emit_single_work_func_def(const std::string &func_name, + + Block *func_ir) { + emit("void {}(", func_name); + emit(" const int {}) {{", kLinearLoopIndexName); + // We do not need additional indentation, because |func_ir| itself is a + // block, which will be indented automatically. 
+ func_ir->accept(this); + emit("}}\n"); // closes this function + } + + void emit_call_single_work_func(const std::string &func_name, + + const std::string &loop_index_expr) { + emit("{}({});", func_name, loop_index_expr); + } + + std::string at_buffer(const Stmt *ptr, DataType dt) const { + const std::string buffer_name = + buffer_instance_name(ptr_to_buffers_.at(ptr)); + return fmt::format("{}[{}]", buffer_name, vk_data_address_shifter(ptr, dt)); + } + + std::string single_work_func_name() const { + return task_name_ + "_func"; + } + + std::vector get_common_buffer_binds() const { + std::vector result; + int binding = 0; + result.push_back({BuffersEnum::Root, binding++}); + result.push_back({BuffersEnum::GlobalTmps, binding++}); + if (!ctx_attribs_->empty()) { + result.push_back({BuffersEnum::Context, binding++}); + } + return result; + } + + class SectionGuard { + public: + SectionGuard(TaskCodegen *tcg, Section new_sec) + : tcg_(tcg), saved_(tcg->code_section_) { + tcg_->code_section_ = new_sec; + } + + ~SectionGuard() { + tcg_->code_section_ = saved_; + } + + private: + TaskCodegen *const tcg_; + const Section saved_; + }; + + friend class SectionGuard; + + template + void emit(std::string f, Args &&... 
args) { + // TI_INFO(f, args...); + current_appender().append(std::move(f), std::forward(args)...); + } + + void push_indent() { + current_appender().push_indent(); + } + + void pop_indent() { + current_appender().pop_indent(); + } + + LineAppender ¤t_appender() { + return section_appenders_[code_section_]; + } + + OffloadedStmt *const task_ir_; // not owned + const CompiledSNodeStructs *const compiled_structs_; // not owned + const KernelContextAttributes *const ctx_attribs_; // not owned + const std::string task_name_; + + TaskAttributes task_attribs_; + GetRootStmt *root_stmt_{nullptr}; + std::unordered_map ptr_to_buffers_; + Section code_section_{Section::Kernels}; + std::unordered_map section_appenders_; +}; + +class KernelCodegen { + public: + struct Params { + std::string ti_kernel_name; + Kernel *kernel; + const CompiledSNodeStructs *compiled_structs; + }; + + explicit KernelCodegen(const Params ¶ms) + : params_(params), ctx_attribs_(*params.kernel) { + } + + using Result = VkRuntime::RegisterParams; + + Result run() { + Result res; + auto &kernel_attribs = res.kernel_attribs; + auto *root = params_.kernel->ir->as(); + auto &tasks = root->statements; + for (int i = 0; i < tasks.size(); ++i) { + TaskCodegen::Params tp; + tp.task_ir = tasks[i]->as(); + tp.task_id_in_kernel = i; + tp.compiled_structs = params_.compiled_structs; + tp.ctx_attribs = &ctx_attribs_; + tp.ti_kernel_name = params_.ti_kernel_name; + + TaskCodegen cgen(tp); + auto task_res = cgen.run(); + kernel_attribs.tasks_attribs.push_back(std::move(task_res.task_attribs)); + res.task_glsl_source_codes.push_back(std::move(task_res.source_code)); + } + kernel_attribs.ctx_attribs = std::move(ctx_attribs_); + kernel_attribs.name = params_.ti_kernel_name; + kernel_attribs.is_jit_evaluator = params_.kernel->is_evaluator; + return res; + } + + private: + Params params_; + KernelContextAttributes ctx_attribs_; +}; + +} // namespace + +void lower(Kernel *kernel) { + auto &config = kernel->program->config; 
+ config.demote_dense_struct_fors = true; + irpass::compile_to_executable(kernel->ir.get(), config, kernel, + /*vectorize=*/false, kernel->grad, + /*ad_use_stack=*/false, config.print_ir, + /*lower_global_access=*/true, + /*make_thread_local=*/false); +} + +FunctionType compile_to_executable(Kernel *kernel, + const CompiledSNodeStructs *compiled_structs, + VkRuntime *runtime) { + const auto id = Program::get_kernel_id(); + const auto taichi_kernel_name(fmt::format("{}_k{:04d}_vk", kernel->name, id)); + TI_INFO("VK codegen for Taichi kernel={}", taichi_kernel_name); + KernelCodegen::Params params; + params.ti_kernel_name = taichi_kernel_name; + params.kernel = kernel; + params.compiled_structs = compiled_structs; + KernelCodegen codegen(params); + auto res = codegen.run(); + auto handle = runtime->register_taichi_kernel(std::move(res)); + return [runtime, handle, taichi_kernel_name](Context &ctx) { + runtime->launch_kernel(handle, &ctx); + }; +} + +} // namespace vulkan +} // namespace lang +} // namespace taichi diff --git a/taichi/backends/vulkan/codegen_vulkan.h b/taichi/backends/vulkan/codegen_vulkan.h new file mode 100644 index 000000000..64210d95e --- /dev/null +++ b/taichi/backends/vulkan/codegen_vulkan.h @@ -0,0 +1,25 @@ +#pragma once + +#include "taichi/lang_util.h" + +#include "taichi/backends/vulkan/snode_struct_compiler.h" + +namespace taichi { +namespace lang { + +class Kernel; + +namespace vulkan { + +class VkRuntime; + +void lower(Kernel *kernel); + +// These ASTs must have already been lowered at the CHI level. 
+FunctionType compile_to_executable(Kernel *kernel, + const CompiledSNodeStructs *compiled_structs, + VkRuntime *runtime); + +} // namespace vulkan +} // namespace lang +} // namespace taichi diff --git a/taichi/backends/vulkan/shaders/atomics.glsl.h b/taichi/backends/vulkan/shaders/atomics.glsl.h new file mode 100644 index 000000000..1c183b6f4 --- /dev/null +++ b/taichi/backends/vulkan/shaders/atomics.glsl.h @@ -0,0 +1,44 @@ +// clang-format on +#include "taichi/backends/vulkan/shaders/prologue.h" + +#ifndef TI_INSIDE_VULKAN_CODEGEN +static_assert(false, "do not include"); +#endif // TI_INSIDE_VULKAN_CODEGEN + +#define VULKAN_BEGIN_CODE_DEF constexpr auto kVulkanAtomicsSourceCode = +#define VULKAN_END_CODE_DEF ; + +// clang-format off +VULKAN_BEGIN_CODE_DEF +STR( +// TODO: don't duplicate, pass in pointer +float fatomicAdd_root_buffer(int addr, float data) { + int old_val = 0; + int new_val = 0; + int cas_val = 0; + int ok = 0; + while (ok == 0) { + old_val = root_buffer[addr]; + new_val = floatBitsToInt(intBitsToFloat(old_val) + data); + cas_val = atomicCompSwap(root_buffer[addr], old_val, new_val); + ok = int(cas_val == old_val); + } + return intBitsToFloat(old_val); +} + +float fatomicAdd_global_tmps_buffer(int addr, float data) { + int old_val = 0; + int new_val = 0; + int cas_val = 0; + int ok = 0; + while (ok == 0) { + old_val = global_tmps_buffer[addr]; + new_val = floatBitsToInt(intBitsToFloat(old_val) + data); + cas_val = atomicCompSwap(global_tmps_buffer[addr], old_val, new_val); + ok = int(cas_val == old_val); + } + return intBitsToFloat(old_val); +} +) +VULKAN_END_CODE_DEF +// clang-format on diff --git a/taichi/backends/vulkan/shaders/epilogue.h b/taichi/backends/vulkan/shaders/epilogue.h new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/taichi/backends/vulkan/shaders/epilogue.h @@ -0,0 +1 @@ + diff --git a/taichi/backends/vulkan/shaders/prologue.h b/taichi/backends/vulkan/shaders/prologue.h new file mode 100644 index 
000000000..bbacdf2ef --- /dev/null +++ b/taichi/backends/vulkan/shaders/prologue.h @@ -0,0 +1,28 @@ +#ifdef TI_INSIDE_VULKAN_CODEGEN + +#include "taichi/util/macros.h" + +#else + +#define STR(...) __VA_ARGS__ + +#define inout + +// GLSL builtin stubs +int floatBitsToInt(float f) { + return *reinterpret_cast<int *>(&f); +} + +float intBitsToFloat(int i) { + return *reinterpret_cast<float *>(&i); +} + +int atomicCompSwap(int &mem, int compare, int data) { + const int old = mem; + if (mem == compare) { + mem = data; + } + return old; +} + +#endif // TI_INSIDE_VULKAN_CODEGEN diff --git a/taichi/backends/vulkan/snode_struct_compiler.cpp b/taichi/backends/vulkan/snode_struct_compiler.cpp new file mode 100644 index 000000000..e586cd582 --- /dev/null +++ b/taichi/backends/vulkan/snode_struct_compiler.cpp @@ -0,0 +1,90 @@ +#include "taichi/backends/vulkan/snode_struct_compiler.h" + +#include "taichi/backends/vulkan/data_type_utils.h" + +namespace taichi { +namespace lang { +namespace vulkan { +namespace { + +class StructCompiler { + public: + CompiledSNodeStructs run(const SNode &root) { + TI_ASSERT(root.type == SNodeType::root); + + CompiledSNodeStructs result; + result.root_size = compute_snode_size(&root); + result.snode_descriptors = std::move(snode_descriptors_); + TI_INFO("Vulkan RootBuffer size={}", result.root_size); + return result; + } + + private: + std::size_t compute_snode_size(const SNode *sn) { + const bool is_place = sn->is_place(); + + SNodeDescriptor sn_desc; + sn_desc.snode = sn; + if (is_place) { + sn_desc.cell_stride = vk_data_type_size(sn->dt); + sn_desc.container_stride = sn_desc.cell_stride; + } else { + std::size_t cell_stride = 0; + for (const auto &ch : sn->ch) { + const auto child_offset = cell_stride; + const auto *ch_snode = ch.get(); + cell_stride += compute_snode_size(ch_snode); + snode_descriptors_.find(ch_snode->id) + ->second.mem_offset_in_parent_cell = child_offset; + } + sn_desc.cell_stride = cell_stride; + sn_desc.container_stride = + cell_stride * 
sn_desc.cells_per_container_pot(); + } + + sn_desc.total_num_cells_from_root = 1; + for (const auto &e : sn->extractors) { + // Note that the extractors are set in two places: + // 1. When a new SNode is first defined + // 2. StructCompiler::infer_snode_properties() + // The second step is the finalized result. + sn_desc.total_num_cells_from_root *= e.num_elements; + } + + TI_INFO("SNodeDescriptor"); + TI_INFO("* snode={}", sn_desc.snode->id); + TI_INFO("* type={} (is_place={})", sn_desc.snode->node_type_name, is_place); + TI_INFO("* cell_stride={}", sn_desc.cell_stride); + TI_INFO("* cells_per_container_pot={}", sn_desc.cells_per_container_pot()); + TI_INFO("* container_stride={}", sn_desc.container_stride); + TI_INFO("* total_num_cells_from_root={}", + sn_desc.total_num_cells_from_root); + TI_INFO(""); + + TI_ASSERT(snode_descriptors_.find(sn->id) == snode_descriptors_.end()); + snode_descriptors_[sn->id] = sn_desc; + return sn_desc.container_stride; + } + + SNodeDescriptorsMap snode_descriptors_; +}; + +} // namespace + +int SNodeDescriptor::cells_per_container_pot() const { + // For root, |snode->n| is 0. + const auto ty = snode->type; + if (ty == SNodeType::root || ty == SNodeType::place) { + return 1; + } + return snode->n; +} + +CompiledSNodeStructs compile_snode_structs(const SNode &root) { + StructCompiler compiler; + return compiler.run(root); +} + +} // namespace vulkan +} // namespace lang +} // namespace taichi diff --git a/taichi/backends/vulkan/snode_struct_compiler.h b/taichi/backends/vulkan/snode_struct_compiler.h new file mode 100644 index 000000000..426c96ee0 --- /dev/null +++ b/taichi/backends/vulkan/snode_struct_compiler.h @@ -0,0 +1,53 @@ +// Codegen for the hierarchical data structure +#pragma once + +#include + +#include "taichi/ir/snode.h" + +namespace taichi { +namespace lang { +namespace vulkan { + +struct SNodeDescriptor { + const SNode *snode = nullptr; + // Stride (bytes) of a single cell. 
+ int cell_stride = 0; + + // Number of cells per container, padded to Power of Two (pot). + int cells_per_container_pot() const; + + // Bytes of a single container. + int container_stride = 0; + + // Total number of CELLS of this SNode, NOT padded to PoT. + // For example, for a layout of + // ti.root + // .dense(ti.ij, (3, 2)) // S1 + // .dense(ti.ij, (5, 3)) // S2 + // |total_num_cells_from_root| for S2 is 3x2x5x3 = 90. That is, S2 has a total + // of 90 cells. Note that the number of S2 (container) itself is 3x2=6! + int total_num_cells_from_root = 0; + // An SNode can have multiple number of components, where each component + // starts at a fixed offset in its parent cell's memory. + int mem_offset_in_parent_cell = 0; + + SNode *get_child(int ch_i) const { + return snode->ch[ch_i].get(); + } +}; + +using SNodeDescriptorsMap = std::unordered_map; + +struct CompiledSNodeStructs { + // Root buffer size in bytes. + size_t root_size; + // Map from SNode ID to its descriptor. + SNodeDescriptorsMap snode_descriptors; +}; + +CompiledSNodeStructs compile_snode_structs(const SNode &root); + +} // namespace vulkan +} // namespace lang +} // namespace taichi diff --git a/taichi/backends/vulkan/vulkan_simple_memory_pool.cpp b/taichi/backends/vulkan/vulkan_simple_memory_pool.cpp new file mode 100644 index 000000000..72ee792e5 --- /dev/null +++ b/taichi/backends/vulkan/vulkan_simple_memory_pool.cpp @@ -0,0 +1,143 @@ +#include "taichi/backends/vulkan/vulkan_simple_memory_pool.h" + +#include "taichi/math/arithmetic.h" +#include "taichi/backends/vulkan/vulkan_common.h" +#include "taichi/common/logging.h" + +namespace taichi { +namespace lang { +namespace vulkan { + +namespace { + +static constexpr VkDeviceSize kAlignment = 256; + +VkDeviceSize roundup_aligned(VkDeviceSize size) { + return iroundup(size, kAlignment); +} + +} // namespace + +VkBufferWithMemory::VkBufferWithMemory(VkDevice device, + VkBuffer buffer, + VkDeviceMemory mem, + VkDeviceSize size, + VkDeviceSize offset) 
+ : device_(device), + buffer_(buffer), + backing_memory_(mem), + size_(size), + offset_in_mem_(offset) { + TI_ASSERT(buffer_ != VK_NULL_HANDLE); + TI_ASSERT(size_ > 0); + TI_ASSERT(backing_memory_ != VK_NULL_HANDLE); +} + +VkBufferWithMemory::~VkBufferWithMemory() { + if (buffer_ != VK_NULL_HANDLE) { + vkDestroyBuffer(device_, buffer_, kNoVkAllocCallbacks); + } +} + +LinearVkMemoryPool::LinearVkMemoryPool(const Params ¶ms, + VkDeviceMemory mem, + uint32_t mti) + : device_(params.device), + memory_(mem), + memory_type_index_(mti), + compute_queue_family_index_(params.compute_queue_family_index), + buffer_creation_template_(params.buffer_creation_template), + pool_size_(params.pool_size), + next_(0) { + buffer_creation_template_.size = 0; + buffer_creation_template_.queueFamilyIndexCount = 1; + buffer_creation_template_.pQueueFamilyIndices = &compute_queue_family_index_; +} + +LinearVkMemoryPool::~LinearVkMemoryPool() { + if (memory_ != VK_NULL_HANDLE) { + vkFreeMemory(device_, memory_, kNoVkAllocCallbacks); + } +} + +// static +std::unique_ptr LinearVkMemoryPool::try_make( + Params params) { + params.pool_size = roundup_aligned(params.pool_size); + + VkMemoryAllocateInfo alloc_info{}; + alloc_info.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + alloc_info.allocationSize = params.pool_size; + const auto mem_type_index = find_memory_type(params); + if (!mem_type_index.has_value()) { + return nullptr; + } + alloc_info.memoryTypeIndex = mem_type_index.value(); + VkDeviceMemory mem; + if (vkAllocateMemory(params.device, &alloc_info, kNoVkAllocCallbacks, &mem) != + VK_SUCCESS) { + return nullptr; + } + return std::make_unique(params, mem, + alloc_info.memoryTypeIndex); +} + +std::unique_ptr LinearVkMemoryPool::alloc_and_bind( + VkDeviceSize buf_size) { + buf_size = roundup_aligned(buf_size); + if (pool_size_ <= (next_ + buf_size)) { + TI_WARN("Vulkan memory pool exhausted, max size={}", pool_size_); + return nullptr; + } + + VkBuffer buffer; + 
buffer_creation_template_.size = buf_size; + BAIL_ON_VK_BAD_RESULT(vkCreateBuffer(device_, &buffer_creation_template_, + kNoVkAllocCallbacks, &buffer), + "failed to create buffer"); + buffer_creation_template_.size = 0; // reset + const auto offset_in_mem = next_; + next_ += buf_size; + BAIL_ON_VK_BAD_RESULT( + vkBindBufferMemory(device_, buffer, memory_, offset_in_mem), + "failed to bind buffer to memory"); + + VkMemoryRequirements mem_requirements; + vkGetBufferMemoryRequirements(device_, buffer, &mem_requirements); + TI_ASSERT(mem_requirements.memoryTypeBits & (1 << memory_type_index_)); + TI_ASSERT_INFO((buf_size % mem_requirements.alignment) == 0, + "buf_size={} required alignment={}", buf_size, + mem_requirements.alignment); + return std::make_unique(device_, buffer, memory_, + buf_size, offset_in_mem); +} + +// static +std::optional LinearVkMemoryPool::find_memory_type( + const Params ¶ms) { + VkPhysicalDeviceMemoryProperties mem_properties; + vkGetPhysicalDeviceMemoryProperties(params.physical_device, &mem_properties); + auto satisfies = [&](int i) -> bool { + const auto &mem_type = mem_properties.memoryTypes[i]; + if ((mem_type.propertyFlags & params.required_properties) != + params.required_properties) { + return false; + } + if (mem_properties.memoryHeaps[mem_type.heapIndex].size <= + params.pool_size) { + return false; + } + return true; + }; + + for (int i = 0; i < mem_properties.memoryTypeCount; ++i) { + if (satisfies(i)) { + return i; + } + } + return std::nullopt; +} + +} // namespace vulkan +} // namespace lang +} // namespace taichi diff --git a/taichi/backends/vulkan/vulkan_simple_memory_pool.h b/taichi/backends/vulkan/vulkan_simple_memory_pool.h new file mode 100644 index 000000000..9e1023f0a --- /dev/null +++ b/taichi/backends/vulkan/vulkan_simple_memory_pool.h @@ -0,0 +1,111 @@ +#pragma once + +#include +#include + +#include +#include + +namespace taichi { +namespace lang { +namespace vulkan { + +class VkBufferWithMemory { + public: + 
VkBufferWithMemory(VkDevice device, + VkBuffer buffer, + VkDeviceMemory mem, + VkDeviceSize size, + VkDeviceSize offset); + + // Just use std::unique_ptr to save all the trouble from crafting move ctors + // on our own + VkBufferWithMemory(const VkBufferWithMemory &) = delete; + VkBufferWithMemory &operator=(const VkBufferWithMemory &) = delete; + VkBufferWithMemory(VkBufferWithMemory &&) = delete; + VkBufferWithMemory &operator=(VkBufferWithMemory &&) = delete; + + ~VkBufferWithMemory(); + + VkBuffer buffer() const { + return buffer_; + } + + VkDeviceSize size() const { + return size_; + } + + VkDeviceSize offset_in_mem() const { + return offset_in_mem_; + } + + class Mapped { + public: + explicit Mapped(VkBufferWithMemory *buf) : buf_(buf), data_(nullptr) { + vkMapMemory(buf_->device_, buf_->backing_memory_, buf_->offset_in_mem(), + buf_->size(), /*flags=*/0, &data_); + } + + ~Mapped() { + vkUnmapMemory(buf_->device_, buf_->backing_memory_); + } + + void *data() const { + return data_; + } + + private: + VkBufferWithMemory *const buf_; // not owned + void *data_; + }; + + Mapped map_mem() { + return Mapped(this); + } + + private: + friend class Mapped; + + VkDevice device_{VK_NULL_HANDLE}; + VkBuffer buffer_{VK_NULL_HANDLE}; + VkDeviceMemory backing_memory_{VK_NULL_HANDLE}; + VkDeviceSize size_{0}; + VkDeviceSize offset_in_mem_{0}; +}; + +// TODO: Use +// https://gpuopen-librariesandsdks.github.io/VulkanMemoryAllocator/html/custom_memory_pools.html +class LinearVkMemoryPool { + public: + struct Params { + VkPhysicalDevice physical_device{VK_NULL_HANDLE}; + VkDevice device{VK_NULL_HANDLE}; + VkMemoryPropertyFlags required_properties; + VkDeviceSize pool_size{0}; + uint32_t compute_queue_family_index{0}; + VkBufferCreateInfo buffer_creation_template{}; + }; + + LinearVkMemoryPool(const Params ¶ms, VkDeviceMemory mem, uint32_t mti); + + ~LinearVkMemoryPool(); + + static std::unique_ptr try_make(Params params); + + std::unique_ptr alloc_and_bind(VkDeviceSize 
buf_size); + + private: + static std::optional find_memory_type(const Params ¶ms); + + VkDevice device_{VK_NULL_HANDLE}; // not owned + VkDeviceMemory memory_{VK_NULL_HANDLE}; + uint32_t memory_type_index_{0}; + uint32_t compute_queue_family_index_{0}; + VkBufferCreateInfo buffer_creation_template_{}; + VkDeviceSize pool_size_{0}; + VkDeviceSize next_{0}; +}; + +} // namespace vulkan +} // namespace lang +} // namespace taichi diff --git a/taichi/backends/vulkan/vulkan_utils.cpp b/taichi/backends/vulkan/vulkan_utils.cpp new file mode 100644 index 000000000..d0fc7d60f --- /dev/null +++ b/taichi/backends/vulkan/vulkan_utils.cpp @@ -0,0 +1,54 @@ +#include "taichi/backends/vulkan/vulkan_utils.h" + +#include + +namespace taichi { +namespace lang { +namespace vulkan { + +std::vector GetInstanceExtensionProperties() { + constexpr char *kNoLayerName = nullptr; + uint32_t count = 0; + vkEnumerateInstanceExtensionProperties(kNoLayerName, &count, nullptr); + std::vector extensions(count); + vkEnumerateInstanceExtensionProperties(kNoLayerName, &count, + extensions.data()); + return extensions; +} + +std::vector GetDeviceExtensionProperties( + VkPhysicalDevice physicalDevice) { + constexpr char *kNoLayerName = nullptr; + uint32_t count = 0; + vkEnumerateDeviceExtensionProperties(physicalDevice, kNoLayerName, &count, + nullptr); + std::vector extensions(count); + vkEnumerateDeviceExtensionProperties(physicalDevice, kNoLayerName, &count, + extensions.data()); + return extensions; +} + +GlslToSpirvCompiler::GlslToSpirvCompiler(const ErrorHandler &err_handler) + : err_handler_(err_handler) { + opts_.SetTargetEnvironment(shaderc_target_env_vulkan, + VulkanEnvSettings::kShadercEnvVersion()); + opts_.SetOptimizationLevel(shaderc_optimization_level_performance); +} + +std::optional GlslToSpirvCompiler::compile( + const std::string &glsl_src, + const std::string &shader_name) { + auto spv_result = + compiler_.CompileGlslToSpv(glsl_src, shaderc_glsl_default_compute_shader, + 
/*input_file_name=*/shader_name.c_str(), + /*entry_point_name=*/"main", opts_); + if (spv_result.GetCompilationStatus() != shaderc_compilation_status_success) { + err_handler_(glsl_src, shader_name, spv_result.GetErrorMessage()); + return std::nullopt; + } + return SpirvBinary(spv_result.begin(), spv_result.end()); +} + +} // namespace vulkan +} // namespace lang +} // namespace taichi diff --git a/taichi/backends/vulkan/vulkan_utils.h b/taichi/backends/vulkan/vulkan_utils.h new file mode 100644 index 000000000..6f01e9f1f --- /dev/null +++ b/taichi/backends/vulkan/vulkan_utils.h @@ -0,0 +1,52 @@ +#pragma once + +#include + +#include +#include +#include +#include +#include + +namespace taichi { +namespace lang { + +namespace vulkan { + +std::vector GetInstanceExtensionProperties(); + +std::vector GetDeviceExtensionProperties( + VkPhysicalDevice physicalDevice); + +class VulkanEnvSettings { + public: + static constexpr uint32_t kApiVersion() { + return VK_API_VERSION_1_0; + } + + static constexpr shaderc_env_version kShadercEnvVersion() { + return shaderc_env_version_vulkan_1_0; + } +}; + +class GlslToSpirvCompiler { + public: + using SpirvBinary = std::vector; + using ErrorHandler = std::function; + + explicit GlslToSpirvCompiler(const ErrorHandler &err_handler); + + std::optional compile(const std::string &glsl_src, + const std::string &shader_name); + + private: + shaderc::CompileOptions opts_; + shaderc::Compiler compiler_; + ErrorHandler err_handler_{nullptr}; +}; + +} // namespace vulkan +} // namespace lang +} // namespace taichi diff --git a/taichi/backends/wasm/codegen_wasm.cpp b/taichi/backends/wasm/codegen_wasm.cpp index 1649f11e8..cf5d68224 100644 --- a/taichi/backends/wasm/codegen_wasm.cpp +++ b/taichi/backends/wasm/codegen_wasm.cpp @@ -13,6 +13,12 @@ namespace taichi { namespace lang { +namespace { +constexpr std::array kPreloadedFuncNames = { + "wasm_materialize", "wasm_set_kernel_parameter_i32", + "wasm_set_kernel_parameter_f32"}; +} + class 
CodeGenLLVMWASM : public CodeGenLLVM { public: using IRVisitor::visit; @@ -173,14 +179,16 @@ class CodeGenLLVMWASM : public CodeGenLLVM { ir->accept(this); finalize_taichi_kernel_function(); - auto wasm_materialize_name = "wasm_materialize"; - // compile_module_to_executable // only keep the current func TaichiLLVMContext::eliminate_unused_functions( - module.get(), [&](std::string func_name) { - return offloaded_task_name == func_name || - wasm_materialize_name == func_name; + module.get(), [offloaded_task_name](const std::string &func_name) { + for (auto &name : kPreloadedFuncNames) { + if (std::string(name) == func_name) { + return true; + } + } + return func_name == offloaded_task_name; }); tlctx->add_module(std::move(module)); auto kernel_symbol = tlctx->lookup_function_pointer(offloaded_task_name); @@ -199,9 +207,6 @@ FunctionType CodeGenWASM::codegen() { std::unique_ptr CodeGenWASM::modulegen( std::unique_ptr &&module) { - /* - TODO: move wasm_materialize to dump process in AOT. - */ bool init_flag = module == nullptr; std::vector name_list; @@ -211,8 +216,11 @@ std::unique_ptr CodeGenWASM::modulegen( gen->emit_to_module(); gen->finalize_taichi_kernel_function(); + // TODO: move the following functions to dump process in AOT. 
if (init_flag) { - name_list.emplace_back("wasm_materialize"); + for (auto &name : kPreloadedFuncNames) { + name_list.emplace_back(name); + } } gen->tlctx->jit->global_optimize_module(gen->module.get()); diff --git a/taichi/ir/frontend_ir.h b/taichi/ir/frontend_ir.h index ec2b99933..a02f3bdc2 100644 --- a/taichi/ir/frontend_ir.h +++ b/taichi/ir/frontend_ir.h @@ -375,6 +375,7 @@ class GlobalVariableExpression : public Expression { public: Identifier ident; DataType dt; + std::string name; SNode *snode; bool has_ambient; TypedConstant ambient_value; diff --git a/taichi/ir/type_utils.cpp b/taichi/ir/type_utils.cpp index dffe9ff99..7ba723994 100644 --- a/taichi/ir/type_utils.cpp +++ b/taichi/ir/type_utils.cpp @@ -25,17 +25,11 @@ std::string data_type_format(DataType dt) { } else if (dt->is_primitive(PrimitiveTypeID::u32)) { return "%u"; } else if (dt->is_primitive(PrimitiveTypeID::i64)) { -#if defined(TI_PLATFORM_UNIX) + // Use %lld on Windows. + // Discussion: https://github.com/taichi-dev/taichi/issues/2522 return "%lld"; -#else - return "%I64d"; -#endif } else if (dt->is_primitive(PrimitiveTypeID::u64)) { -#if defined(TI_PLATFORM_UNIX) return "%llu"; -#else - return "%I64u"; -#endif } else if (dt->is_primitive(PrimitiveTypeID::f32)) { return "%f"; } else if (dt->is_primitive(PrimitiveTypeID::f64)) { diff --git a/taichi/llvm/llvm_context.cpp b/taichi/llvm/llvm_context.cpp index 19c46efc9..9597a2523 100644 --- a/taichi/llvm/llvm_context.cpp +++ b/taichi/llvm/llvm_context.cpp @@ -230,13 +230,6 @@ void compile_runtime_bitcode(Arch arch) { } } -void compile_runtimes() { - compile_runtime_bitcode(host_arch()); -#if defined(TI_WITH_CUDA) - compile_runtime_bitcode(Arch::cuda); -#endif -} - std::string libdevice_path() { std::string folder; if (is_release()) { diff --git a/taichi/program/kernel.cpp b/taichi/program/kernel.cpp index a5e3ab019..cf884cbc4 100644 --- a/taichi/program/kernel.cpp +++ b/taichi/program/kernel.cpp @@ -276,25 +276,25 @@ Context 
&Kernel::LaunchContextBuilder::get_context() { float64 Kernel::get_ret_float(int i) { auto dt = rets[i].dt->get_compute_type(); if (dt->is_primitive(PrimitiveTypeID::f32)) { - return (float64)get_current_program().fetch_result(i); + return (float64)program->fetch_result(i); } else if (dt->is_primitive(PrimitiveTypeID::f64)) { - return (float64)get_current_program().fetch_result(i); + return (float64)program->fetch_result(i); } else if (dt->is_primitive(PrimitiveTypeID::i32)) { - return (float64)get_current_program().fetch_result(i); + return (float64)program->fetch_result(i); } else if (dt->is_primitive(PrimitiveTypeID::i64)) { - return (float64)get_current_program().fetch_result(i); + return (float64)program->fetch_result(i); } else if (dt->is_primitive(PrimitiveTypeID::i8)) { - return (float64)get_current_program().fetch_result(i); + return (float64)program->fetch_result(i); } else if (dt->is_primitive(PrimitiveTypeID::i16)) { - return (float64)get_current_program().fetch_result(i); + return (float64)program->fetch_result(i); } else if (dt->is_primitive(PrimitiveTypeID::u8)) { - return (float64)get_current_program().fetch_result(i); + return (float64)program->fetch_result(i); } else if (dt->is_primitive(PrimitiveTypeID::u16)) { - return (float64)get_current_program().fetch_result(i); + return (float64)program->fetch_result(i); } else if (dt->is_primitive(PrimitiveTypeID::u32)) { - return (float64)get_current_program().fetch_result(i); + return (float64)program->fetch_result(i); } else if (dt->is_primitive(PrimitiveTypeID::u64)) { - return (float64)get_current_program().fetch_result(i); + return (float64)program->fetch_result(i); } else { TI_NOT_IMPLEMENTED } @@ -303,25 +303,25 @@ float64 Kernel::get_ret_float(int i) { int64 Kernel::get_ret_int(int i) { auto dt = rets[i].dt->get_compute_type(); if (dt->is_primitive(PrimitiveTypeID::i32)) { - return (int64)get_current_program().fetch_result(i); + return (int64)program->fetch_result(i); } else if 
(dt->is_primitive(PrimitiveTypeID::i64)) { - return (int64)get_current_program().fetch_result(i); + return (int64)program->fetch_result(i); } else if (dt->is_primitive(PrimitiveTypeID::i8)) { - return (int64)get_current_program().fetch_result(i); + return (int64)program->fetch_result(i); } else if (dt->is_primitive(PrimitiveTypeID::i16)) { - return (int64)get_current_program().fetch_result(i); + return (int64)program->fetch_result(i); } else if (dt->is_primitive(PrimitiveTypeID::u8)) { - return (int64)get_current_program().fetch_result(i); + return (int64)program->fetch_result(i); } else if (dt->is_primitive(PrimitiveTypeID::u16)) { - return (int64)get_current_program().fetch_result(i); + return (int64)program->fetch_result(i); } else if (dt->is_primitive(PrimitiveTypeID::u32)) { - return (int64)get_current_program().fetch_result(i); + return (int64)program->fetch_result(i); } else if (dt->is_primitive(PrimitiveTypeID::u64)) { - return (int64)get_current_program().fetch_result(i); + return (int64)program->fetch_result(i); } else if (dt->is_primitive(PrimitiveTypeID::f32)) { - return (int64)get_current_program().fetch_result(i); + return (int64)program->fetch_result(i); } else if (dt->is_primitive(PrimitiveTypeID::f64)) { - return (int64)get_current_program().fetch_result(i); + return (int64)program->fetch_result(i); } else { TI_NOT_IMPLEMENTED } diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp index 456c74bb3..b8c88894e 100644 --- a/taichi/program/program.cpp +++ b/taichi/program/program.cpp @@ -586,6 +586,10 @@ void Program::async_flush() { async_engine->flush(); } +int Program::get_snode_tree_size() { + return snode_trees_.size(); +} + std::string capitalize_first(std::string s) { s[0] = std::toupper(s[0]); return s; diff --git a/taichi/program/program.h b/taichi/program/program.h index c9c392a48..1eb7ae9ae 100644 --- a/taichi/program/program.h +++ b/taichi/program/program.h @@ -169,6 +169,8 @@ class Program { */ void materialize_runtime(); + 
int get_snode_tree_size(); + void visualize_layout(const std::string &fn); struct KernelProxy { diff --git a/taichi/program/snode_expr_utils.cpp b/taichi/program/snode_expr_utils.cpp index b83e7e7c0..02e7ee3c3 100644 --- a/taichi/program/snode_expr_utils.cpp +++ b/taichi/program/snode_expr_utils.cpp @@ -67,7 +67,11 @@ void place_child(Expr *expr_arg, } auto &child = parent->insert_children(SNodeType::place); glb_var_expr->set_snode(&child); - child.name = glb_var_expr->ident.raw_name(); + if (glb_var_expr->name == "") { + child.name = glb_var_expr->ident.raw_name(); + } else { + child.name = glb_var_expr->name; + } if (glb_var_expr->has_ambient) { glb_var_expr->snode->has_ambient = true; glb_var_expr->snode->ambient_val = glb_var_expr->ambient_value; diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index 918cb24e6..ed469c619 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -55,7 +55,6 @@ void expr_assign(const Expr &lhs_, const Expr &rhs, std::string tb) { std::vector> scope_stack; -void compile_runtimes(); std::string libdevice_path(); std::string get_runtime_dir(); @@ -231,7 +230,10 @@ void export_lang(py::module &m) { .def("synchronize", &Program::synchronize) .def("async_flush", &Program::async_flush) .def("materialize_runtime", &Program::materialize_runtime) - .def("make_aot_module_builder", &Program::make_aot_module_builder); + .def("make_aot_module_builder", &Program::make_aot_module_builder) + .def("get_snode_tree_size", &Program::get_snode_tree_size) + .def("get_snode_root", &Program::get_snode_root, + py::return_value_policy::reference); py::class_(m, "AotModuleBuilder") .def("add", &AotModuleBuilder::add) @@ -276,6 +278,7 @@ void export_lang(py::module &m) { get_current_program().get_snode_to_glb_var_exprs()); }) .def("data_type", [](SNode *snode) { return snode->dt; }) + .def("name", [](SNode *snode) { return snode->name; }) .def("get_num_ch", [](SNode *snode) -> int { return 
(int)snode->ch.size(); }) .def( @@ -358,13 +361,25 @@ void export_lang(py::module &m) { [](Expr *expr) { return expr->is(); }) .def("is_external_var", [](Expr *expr) { return expr->is(); }) + .def("is_primal", + [](Expr *expr) { + return expr->cast()->is_primal; + }) .def("set_tb", &Expr::set_tb) + .def("set_name", + [&](Expr *expr, std::string na) { + expr->cast()->name = na; + }) .def("set_is_primal", [&](Expr *expr, bool v) { expr->cast()->is_primal = v; }) .def("set_grad", &Expr::set_grad) .def("set_attribute", &Expr::set_attribute) + .def("get_expr_name", + [](Expr *expr) { + return expr->cast()->name; + }) .def("get_attribute", &Expr::get_attribute) .def("get_raw_address", [](Expr *expr) { return (uint64)expr; }) .def("get_underlying_ptr_address", [](Expr *e) { @@ -744,7 +759,6 @@ void export_lang(py::module &m) { m.def("test_throw", [] { throw IRModified(); }); m.def("needs_grad", needs_grad); - m.def("compile_runtimes", compile_runtimes); m.def("libdevice_path", libdevice_path); m.def("host_arch", host_arch); diff --git a/taichi/runtime/llvm/runtime.cpp b/taichi/runtime/llvm/runtime.cpp index 43cb5a1ef..2888e7f19 100644 --- a/taichi/runtime/llvm/runtime.cpp +++ b/taichi/runtime/llvm/runtime.cpp @@ -1083,19 +1083,19 @@ i32 op_xor_i32(i32 a, i32 b) { return a ^ b; } -#define DEFINE_REDUCTION(op, dtype) \ - dtype warp_reduce_##op##_##dtype(dtype val) { \ - for (int offset = 16; offset > 0; offset /= 2) \ - val = op_##op##_##dtype( \ - val, cuda_shfl_down_sync_i32(0xFFFFFFFF, val, offset, 31)); \ - return val; \ - } \ - dtype reduce_##op##_##dtype(dtype *result, dtype val) { \ - dtype warp_result = warp_reduce_##op##_##dtype(val); \ - if ((thread_idx() & (warp_size() - 1)) == 0) { \ - atomic_##op##_##dtype(result, warp_result); \ - } \ - return val; \ +#define DEFINE_REDUCTION(op, dtype) \ + dtype warp_reduce_##op##_##dtype(dtype val) { \ + for (int offset = 16; offset > 0; offset /= 2) \ + val = op_##op##_##dtype( \ + val, 
cuda_shfl_down_sync_##dtype(0xFFFFFFFF, val, offset, 31)); \ + return val; \ + } \ + dtype reduce_##op##_##dtype(dtype *result, dtype val) { \ + dtype warp_result = warp_reduce_##op##_##dtype(val); \ + if ((thread_idx() & (warp_size() - 1)) == 0) { \ + atomic_##op##_##dtype(result, warp_result); \ + } \ + return val; \ } DEFINE_REDUCTION(add, i32); @@ -1751,6 +1751,14 @@ i32 wasm_materialize(Context *context) { (Ptr)((size_t)context->runtime->rand_states + sizeof(RandState)); return (i32)(size_t)context->runtime->roots[0]; } + +void wasm_set_kernel_parameter_i32(Context *context, int index, i32 value) { + *(i32 *)(&context->args[index]) = value; +} + +void wasm_set_kernel_parameter_f32(Context *context, int index, f32 value) { + *(f32 *)(&context->args[index]) = value; +} } #endif diff --git a/tests/conftest.py b/tests/python/conftest.py similarity index 100% rename from tests/conftest.py rename to tests/python/conftest.py diff --git a/tests/python/test_field.py b/tests/python/test_field.py index 363e71f5d..53dd5af13 100644 --- a/tests/python/test_field.py +++ b/tests/python/test_field.py @@ -92,3 +92,18 @@ def test_default_ip(dtype): x = ti.Vector.field(2, int, ()) assert x.dtype == ti.get_runtime().default_ip + + +@ti.test() +def test_field_name(): + a = ti.field(dtype=ti.f32, shape=(2, 3), name='a') + b = ti.Vector.field(3, dtype=ti.f32, shape=(2, 3), name='b') + c = ti.Matrix.field(3, 3, dtype=ti.f32, shape=(5, 4), name='c') + assert a.name == 'a' + assert b.name == 'b' + assert c.name == 'c' + assert b.snode.name == 'b' + d = [] + for i in range(10): + d.append(ti.field(dtype=ti.f32, shape=(2, 3), name=f'd{i}')) + assert d[i].name == f'd{i}' diff --git a/tests/python/test_fields_builder.py b/tests/python/test_fields_builder.py index f315b4692..a4edd92a7 100644 --- a/tests/python/test_fields_builder.py +++ b/tests/python/test_fields_builder.py @@ -13,18 +13,31 @@ def test_fields_with_shape(): def func(): for i in range(n): x[i] = i - for i in range(n): - assert 
x[i] == i + func() + + for i in range(n): + assert x[i] == i + + y = ti.field(ti.f32, [n]) + + @ti.kernel + def func2(): for i in range(n): - x[i] = i * 2 + y[i] = i * 2 for i in range(n): - assert x[i] == i * 2 + x[i] = i * 3 + + func2() + + for i in range(n): + assert x[i] == i * 3 + assert y[i] == i * 2 func() - with pytest.raises(InvalidOperationError, match='FieldsBuilder finalized'): - y = ti.field(ti.f32, [n]) + for i in range(n): + assert x[i] == i @ti.test(arch=[ti.cpu, ti.cuda]) @@ -72,7 +85,7 @@ def func2(): assert x[i] == i * 3 -@ti.test(arch=[ti.cpu, ti.cuda]) +@ti.test(arch=[ti.cpu, ti.cuda], use_unified_memory=True) def test_fields_builder_pointer(): n = 5 diff --git a/tests/python/test_loop_unique.py b/tests/python/test_loop_unique.py index c4a33d15d..7ee0cce48 100644 --- a/tests/python/test_loop_unique.py +++ b/tests/python/test_loop_unique.py @@ -1,7 +1,7 @@ import taichi as ti -@ti.test(require=ti.extension.sparse) +@ti.test(require=ti.extension.sparse, use_unified_memory=True) def test_loop_unique_simple_1d(): x, y = ti.field(ti.i32), ti.field(ti.i32) @@ -26,7 +26,7 @@ def inc_y(): assert y[i] == expected_result.get(i, 0) -@ti.test(require=ti.extension.sparse) +@ti.test(require=ti.extension.sparse, use_unified_memory=True) def test_loop_unique_binary_op_1d(): x, y = ti.field(ti.i32), ti.field(ti.i32) @@ -51,7 +51,7 @@ def inc_y(): assert y[i] == expected_result.get(i, 0) -@ti.test(require=ti.extension.sparse) +@ti.test(require=ti.extension.sparse, use_unified_memory=True) def test_loop_unique_nested_1d(): x, y = ti.field(ti.i32), ti.field(ti.i32) @@ -77,7 +77,7 @@ def inc_y(): assert y[i] == expected_result.get(i, 0) -@ti.test(require=ti.extension.sparse) +@ti.test(require=ti.extension.sparse, use_unified_memory=True) def test_loop_unique_2d(): x, y, z = ti.field(ti.i32), ti.field(ti.i32), ti.field(ti.i32) diff --git a/tests/python/test_mpm_particle_list.py b/tests/python/test_mpm_particle_list.py index 61e3febb0..de94bc4ed 100644 --- 
a/tests/python/test_mpm_particle_list.py +++ b/tests/python/test_mpm_particle_list.py @@ -33,7 +33,7 @@ def __init__(self, res): def build_pid(self): ti.block_dim(256) for p in self.x: - base = ti.floor(self.x[p] * self.inv_dx - 0.5).cast(int) + base = ti.floor(self.x[p] * self.inv_dx - 0.5).cast(int) + 1 ti.append(self.pid.parent(), base, p) def step(self): diff --git a/tests/python/test_no_grad.py b/tests/python/test_no_grad.py index ef6445130..4aba4558a 100644 --- a/tests/python/test_no_grad.py +++ b/tests/python/test_no_grad.py @@ -1,3 +1,6 @@ +import numpy as np +import pytest + import taichi as ti @@ -19,3 +22,23 @@ def func(): with ti.Tape(loss): func() + + +@ti.all_archs +def test_raise_no_gradient(): + y = ti.field(shape=(), name='y', dtype=ti.f64, needs_grad=True) + x = ti.field(shape=(), name='x', dtype=ti.f32) + z = np.array([1.0]) + + @ti.kernel + def func(x: ti.template()): + y[None] = x.grad[None] * x.grad[None] + z[0] = x.grad[None] + + x[None] = 5. + with pytest.raises(RuntimeError) as e: + func(x) + + assert e.type is RuntimeError + assert e.value.args[ + 0] == f"Gradient x.grad has not been placed, check whether `needs_grad=True`" diff --git a/tests/python/test_reduction.py b/tests/python/test_reduction.py index f390cbe6e..0b859665b 100644 --- a/tests/python/test_reduction.py +++ b/tests/python/test_reduction.py @@ -39,10 +39,18 @@ def _test_reduction_single(dtype, criterion, op): a = ti.field(dtype, shape=N) tot = ti.field(dtype, shape=()) - @ti.kernel - def fill(): - for i in a: - a[i] = i + if dtype in [ti.f32, ti.f64]: + + @ti.kernel + def fill(): + for i in a: + a[i] = i + 0.5 + else: + + @ti.kernel + def fill(): + for i in a: + a[i] = i ti_op = ti_ops[op] @@ -62,7 +70,8 @@ def reduce_tmp() -> dtype: reduce() tot2 = reduce_tmp() - ground_truth = np_ops[op](a.to_numpy()) + np_arr = np.append(a.to_numpy(), [0]) + ground_truth = np_ops[op](np_arr) assert criterion(tot[None], ground_truth) assert criterion(tot2, ground_truth) diff --git 
a/tests/python/test_sparse_basics.py b/tests/python/test_sparse_basics.py index 2cece4351..c6b27d73b 100644 --- a/tests/python/test_sparse_basics.py +++ b/tests/python/test_sparse_basics.py @@ -1,3 +1,5 @@ +import pytest + import taichi as ti @@ -75,3 +77,19 @@ def func(): assert s[None] == 5 * n print(x[257 + n * n * 7]) assert s[None] == 5 * n + + +@pytest.mark.skip(reason='https://github.com/taichi-dev/taichi/issues/2520') +@ti.test(require=ti.extension.sparse, use_unified_memory=False) +def test_pointer_direct_place(): + x, y = ti.field(ti.i32), ti.field(ti.i32) + + N = 1 + ti.root.pointer(ti.i, N).place(x) + ti.root.pointer(ti.i, N).place(y) + + @ti.kernel + def foo(): + pass + + foo() diff --git a/tests/python/test_tensor_reflection.py b/tests/python/test_tensor_reflection.py index 5bac5cdbb..cdae5af42 100644 --- a/tests/python/test_tensor_reflection.py +++ b/tests/python/test_tensor_reflection.py @@ -59,7 +59,7 @@ def test_unordered(): assert val.snode in blk3.get_children() assert blk3 in blk2.get_children() assert blk2 in blk1.get_children() - assert blk1 in ti.root.get_children() + assert blk1 in ti.FieldsBuilder.finalized_roots()[0].get_children() expected_str = f'ti.root => dense {[n]} => dense {[n, m]}' \ f' => dense {[n, m, p]} => place {[n, m, p]}'