taichi-dev · squarefk · Jul 9, 2021 · Jul 14, 2021 · Jul 15, 2021 · Jul 15, 2021
diff --git a/.github/workflows/presubmit.yml b/.github/workflows/presubmit.yml
@@ -4,51 +4,9 @@ on:
     types: [opened, synchronize, reopened]
 
 jobs:
-  title_format:
-    name: Check PR Title
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v2
-      - uses: actions/setup-python@v2
-        with:
-          python-version: 3.8
-
-      - name: Run PR Title Checker
-        run: |
-          pip install semver GitPython
-          python misc/ci_check_pr_title.py "$PR_TITLE"
-        env:
-          PR_TITLE: ${{ github.event.pull_request.title }}
-
-  check_code_format:
-    name: Check Code Format
-    runs-on: ubuntu-latest
-    # This job will be required to pass before merging to master branch.
-    steps:
-      - uses: actions/checkout@v2
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v2
-        with:
-          python-version: 3.8
-      - name: Check code format
-        run: |
-          git config user.email "taichigardener@gmail.com"
-          git config user.name "Taichi Gardener"
-          git checkout -b _fake_squash
-          git remote add upstream https://github.com/taichi-dev/taichi.git
-          git fetch upstream master
-          sudo apt-get install clang-format
-          python3 -m pip install --user yapf gitpython colorama isort
-          python3 python/taichi/code_format.py
-          git checkout -b _enforced_format
-          git commit -am "enforce code format" || true
-          # exit with 1 if there were differences:
-          git diff _fake_squash _enforced_format --exit-code
-
   build_and_test_cpu_required:
     # This job will be required to pass before merging to master branch.
     name: Required Build and Test (CPU)
-    needs: check_code_format
     strategy:
       matrix:
         include:
@@ -95,7 +53,6 @@ jobs:
 
   build_and_test_cpu:
     name: Build and Test (CPU)
-    needs: build_and_test_cpu_required
     strategy:
       matrix:
         include:
@@ -156,91 +113,3 @@ jobs:
           ti test -vr2 -t2
         env:
           RUN_CPP_TESTS: ${{ matrix.with_cpp_tests }}
-
-  build_and_test_gpu_linux:
-    name: Build and Test (GPU)
-    needs: check_code_format
-    runs-on: [self-hosted, cuda, cn]
-    steps:
-      - uses: actions/checkout@v2
-
-      - name: Build
-        run: |
-          git --version
-          export PATH=/home/github/taichi-llvm/bin/:$PATH
-          export CXX=clang++-8
-          export PYTHON=/usr/bin/python3
-          $PYTHON misc/ci_setup.py ci
-        env:
-          CI_SETUP_CMAKE_ARGS: -DTI_WITH_OPENGL:BOOL=ON -DTI_WITH_CC:BOOL=OFF
-
-      - name: Test
-        run: |
-          export PYTHON=/usr/bin/python3
-          export PATH=/home/github/taichi-llvm/bin/:$PATH
-          export PATH=$PATH:$HOME/.local/bin
-          export DISPLAY=:1
-          hash -r
-          glewinfo
-          $PYTHON examples/algorithm/laplace.py
-          ti diagnose
-          ti test -vr2 -t2
-
-  build_and_test_windows:
-    name: Build and Test (Windows)
-    needs: check_code_format
-    runs-on: windows-latest
-    steps:
-      - name: Install 7Zip PowerShell
-        shell: powershell
-        run: Install-Module 7Zip4PowerShell -Force -Verbose
-
-      - uses: actions/checkout@v2
-      - uses: actions/setup-python@v2
-        with:
-          python-version: 3.7
-
-      - name: Add msbuild to PATH
-        uses: microsoft/setup-msbuild@v1.0.2
-
-      - name: Build
-        shell: powershell
-        run: |
-          $env:TAICHI_REPO_DIR = "D:\a\taichi\taichi"
-          $env:PYTHONPATH = "$env:TAICHI_REPO_DIR\python"
-          cd C:\
-          Remove-item alias:curl
-          curl --retry 10 --retry-delay 5 https://github.com/taichi-dev/taichi_assets/releases/download/llvm10/taichi-llvm-10.0.0-msvc2019.zip -LO
-          7z x taichi-llvm-10.0.0-msvc2019.zip -otaichi_llvm
-          curl --retry 10 --retry-delay 5 https://github.com/taichi-dev/taichi_assets/releases/download/llvm10/clang-10.0.0-win.zip -LO
-          7z x clang-10.0.0-win.zip -otaichi_clang
-          $env:PATH += ";C:\taichi_llvm\bin"
-          $env:PATH += ";C:\taichi_clang\bin"
-          $env:PATH += ";$env:TAICHI_REPO_DIR\bin"
-          clang --version
-          cd D:\a\taichi\taichi
-          python -m pip install numpy
-          python -m pip install pybind11
-          python misc/ci_setup.py ci
-          mkdir build
-          cd build
-          cmake .. -G"Visual Studio 16 2019" -A x64 -DPYTHON_EXECUTABLE="$env:PYTHON" -DLLVM_DIR="C:\taichi_llvm\lib\cmake\llvm"
-          msbuild /p:Configuration=RelWithDebInfo /p:Platform=x64 /m taichi.sln
-          cd ..
-        env:
-          PYTHON: C:\hostedtoolcache\windows\Python\3.7.9\x64\python.exe
-
-      - name: Test
-        shell: powershell
-        run: |
-          $env:TAICHI_REPO_DIR = "D:\a\taichi\taichi"
-          $env:PYTHONPATH = "$env:TAICHI_REPO_DIR\python"
-          $env:PATH += ";C:\taichi_llvm\bin"
-          $env:PATH += ";C:\taichi_clang\bin"
-          $env:PATH += ";$env:TAICHI_REPO_DIR\bin"
-          python -c "import taichi"
-          python examples/algorithm/laplace.py
-          python bin/taichi diagnose
-          python bin/taichi test -Cvr2 -t2
-        env:
-          PYTHON: C:\hostedtoolcache\windows\Python\3.7.9\x64\python.exe
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -94,8 +94,8 @@ endif()
 foreach(arch IN LISTS HOST_ARCH CUDA_ARCH)
   add_custom_target(
       "generate_llvm_runtime_${arch}"
-      COMMAND ${CLANG_EXECUTABLE} -S runtime.cpp -o runtime.ll -fno-exceptions -emit-llvm -std=c++17 -D "ARCH_${arch}" -I ${PROJECT_SOURCE_DIR};
-      COMMAND ${LLVM_AS_EXECUTABLE} runtime.ll -o "runtime_${arch}.bc"
+      COMMAND ${CLANG_EXECUTABLE} -S runtime.cpp -o runtime_${arch}.ll -fno-exceptions -emit-llvm -std=c++17 -D "ARCH_${arch}" -I ${PROJECT_SOURCE_DIR};
+      COMMAND ${LLVM_AS_EXECUTABLE} runtime_${arch}.ll -o "runtime_${arch}.bc"
       WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}/taichi/runtime/llvm"
   )
   add_dependencies(${CORE_LIBRARY_NAME} "generate_llvm_runtime_${arch}")

diff --git a/taichi/analysis/bls_analyzer.cpp b/taichi/analysis/bls_analyzer.cpp
@@ -25,8 +25,7 @@ void BLSAnalyzer::generate_block_indices(SNode *snode, BlockIndices *indices) {
   // NOTE: Assuming not vectorized
   for (int i = 0; i < snode->num_active_indices; i++) {
     auto j = snode->physical_index_position[i];
-    indices->push_back(
-        {/*low=*/0, /*high=*/(1 << snode->extractors[j].num_bits) - 1});
+    indices->push_back({/*low=*/0, /*high=*/snode->extractors[j].shape - 1});
   }
 }
 

diff --git a/taichi/backends/metal/kernel_manager.cpp b/taichi/backends/metal/kernel_manager.cpp
@@ -816,9 +816,10 @@ class KernelManager::Impl {
         const auto &ext = sn->extractors[j];
         rtm_ext->extractors[j].num_bits = ext.num_bits;
         rtm_ext->extractors[j].acc_offset = ext.acc_offset;
-        rtm_ext->extractors[j].num_elements = ext.num_elements;
-        TI_DEBUG("  [{}] num_bits={} acc_offset={} num_elements={}", j,
-                 ext.num_bits, ext.acc_offset, ext.num_elements);
+        rtm_ext->extractors[j].num_elements_from_root =
+            ext.num_elements_from_root;
+        TI_DEBUG("  [{}] num_bits={} acc_offset={} num_elements_from_root={}",
+                 j, ext.num_bits, ext.acc_offset, ext.num_elements_from_root);
       }
       TI_DEBUG("");
     }

diff --git a/taichi/backends/metal/shaders/runtime_structs.metal.h b/taichi/backends/metal/shaders/runtime_structs.metal.h
@@ -117,7 +117,7 @@ STR(
         int32_t start = 0;
         int32_t num_bits = 0;
         int32_t acc_offset = 0;
-        int32_t num_elements = 0;
+        int32_t num_elements_from_root = 0;
       };
 
       Extractor extractors[kTaichiMaxNumIndices];

diff --git a/taichi/backends/metal/struct_metal.cpp b/taichi/backends/metal/struct_metal.cpp
@@ -337,7 +337,7 @@ class StructCompiler {
     }
     sn_desc.total_num_elems_from_root = 1;
     for (const auto &e : sn->extractors) {
-      sn_desc.total_num_elems_from_root *= e.num_elements;
+      sn_desc.total_num_elems_from_root *= e.num_elements_from_root;
     }
 
     TI_ASSERT(snode_descriptors_.find(sn->id) == snode_descriptors_.end());

diff --git a/taichi/codegen/codegen_llvm.cpp b/taichi/codegen/codegen_llvm.cpp
@@ -1673,15 +1673,18 @@ void CodeGenLLVM::create_offload_struct_for(OffloadedStmt *stmt, bool spmd) {
 
     auto coord_object = RuntimeObject(kLLVMPhysicalCoordinatesName, this,
                                       builder.get(), new_coordinates);
-    for (int i = 0; i < snode->num_active_indices; i++) {
-      auto j = snode->physical_index_position[i];
-      if (!bit::is_power_of_two(snode->extractors[j].num_elements)) {
-        auto coord = coord_object.get("val", tlctx->get_constant(j));
-        exec_cond = builder->CreateAnd(
-            exec_cond,
-            builder->CreateICmp(
-                llvm::CmpInst::ICMP_SLT, coord,
-                tlctx->get_constant(snode->extractors[j].num_elements)));
+    if (!prog->config.packed) {
+      for (int i = 0; i < snode->num_active_indices; i++) {
+        auto j = snode->physical_index_position[i];
+        if (!bit::is_power_of_two(
+                snode->extractors[j].num_elements_from_root)) {
+          auto coord = coord_object.get("val", tlctx->get_constant(j));
+          exec_cond = builder->CreateAnd(
+              exec_cond, builder->CreateICmp(
+                             llvm::CmpInst::ICMP_SLT, coord,
+                             tlctx->get_constant(
+                                 snode->extractors[j].num_elements_from_root)));
+        }
       }
     }
 

diff --git a/taichi/ir/scratch_pad.h b/taichi/ir/scratch_pad.h
@@ -103,8 +103,7 @@ class ScratchPad {
     block_size.resize(dim);
     for (int i = 0; i < dim; i++) {
       block_size[i] =
-          1 << snode->parent->extractors[snode->physical_index_position[i]]
-                   .num_bits;
+          snode->parent->extractors[snode->physical_index_position[i]].shape;
       TI_ASSERT(bounds[i].low != std::numeric_limits<int>::max());
       TI_ASSERT(bounds[i].high != std::numeric_limits<int>::min());
     }

diff --git a/taichi/ir/snode.cpp b/taichi/ir/snode.cpp
@@ -2,6 +2,7 @@
 
 #include "taichi/ir/ir.h"
 #include "taichi/ir/statements.h"
+#include "taichi/program/program.h"
 
 TLANG_NAMESPACE_BEGIN
 
@@ -34,23 +35,13 @@ SNode &SNode::create_node(std::vector<Index> indices,
                    "hashed node must be child of root due to initialization "
                    "memset limitation.");
   auto &new_node = insert_children(type);
-  new_node.n = 1;
-  for (int i = 0; i < sizes.size(); i++) {
-    auto s = sizes[i];
-    TI_ASSERT(sizes[i] > 0);
-    if (!bit::is_power_of_two(s)) {
-      auto promoted_s = bit::least_pot_bound(s);
-      TI_DEBUG("Non-power-of-two node size {} promoted to {}.", s, promoted_s);
-      s = promoted_s;
-    }
-    TI_ASSERT(bit::is_power_of_two(s));
-    new_node.n *= s;
-  }
   for (int i = 0; i < (int)indices.size(); i++) {
+    TI_ASSERT(sizes[i] > 0);
     auto &ind = indices[i];
     new_node.extractors[ind.value].activate(
         bit::log2int(bit::least_pot_bound(sizes[i])));
-    new_node.extractors[ind.value].num_elements = sizes[i];
+    new_node.extractors[ind.value].shape = sizes[i];
+    new_node.extractors[ind.value].num_elements_from_root = sizes[i];
   }
   return new_node;
 }
@@ -99,7 +90,7 @@ SNode *SNode::get_least_sparse_ancestor() const {
 
 int SNode::shape_along_axis(int i) const {
   const auto &extractor = extractors[physical_index_position[i]];
-  return extractor.num_elements;
+  return extractor.num_elements_from_root;
 }
 
 SNode::SNode() : SNode(0, SNodeType::undefined) {

diff --git a/taichi/ir/snode.h b/taichi/ir/snode.h
@@ -34,26 +34,34 @@ class Index {
  */
 struct IndexExtractor {
   /**
-   * Shape at the given index.
+   * Number of elements from root at this index.
    *
-   * This is the raw shape, *not* padded to power-of-two (POT).
+   * This is the raw number, *not* padded to power-of-two (POT).
    */
-  int num_elements{1};
+  int num_elements_from_root{1};
+  /**
+   * Shape at this index (POT or packed) according to the config.
+   */
+  int shape{1};
+  /**
+   * Accumulated shape from the last activated index to the first one.
+   */
+  int acc_shape{1};
   /**
    * Number of bits needed to store the coordinate at this index.
    *
-   * ceil(log2(num_elements))
+   * ceil(log2(shape))
    */
   int num_bits{0};
   /**
    * Accumulated offset from the last activated index to the first one.
    *
-   * This is the starting bit of this index in a linearized 1D coordiate. For
+   * This is the starting bit of this index in a linearized 1D coordinate. For
    * example, assuming an SNode of (ti.ijk, shape=(4, 8, 16)). ti.i takes 2
    * bits, ti.j 3 bits and ti.k 4 bits. Then for a linearized coordinate:
-   * ti.k uses bits [0, 3), acc_offset=0
-   * tk.j uses btis [3, 6), acc_offset=3
-   * ti.i uses bits [6, 8), acc_offset=6
+   * ti.k uses bits [0, 4), acc_offset=0
+   * ti.j uses bits [4, 7), acc_offset=4
+   * ti.i uses bits [7, 9), acc_offset=7
    */
   int acc_offset{0};
   /**
@@ -106,7 +114,7 @@ class SNode {
   int depth{0};
 
   std::string name;
-  int64 n{0};
+  int64 n{1};
   int total_num_bits{0};
   int total_bit_start{0};
   int chunk_size{0};
@@ -283,7 +291,7 @@ class SNode {
   }
 
   int64 max_num_elements() const {
-    return int64(1) << total_num_bits;
+    return n;
   }
 
   int shape_along_axis(int i) const;

diff --git a/taichi/ir/transforms.h b/taichi/ir/transforms.h
@@ -106,7 +106,7 @@ bool replace_and_insert_statements(
 bool replace_statements(IRNode *root,
                         std::function<bool(Stmt *)> filter,
                         std::function<Stmt *(Stmt *)> finder);
-void demote_dense_struct_fors(IRNode *root);
+void demote_dense_struct_fors(IRNode *root, bool packed);
 bool demote_atomics(IRNode *root, const CompileConfig &config);
 void reverse_segments(IRNode *root);  // for autograd
 void detect_read_only(IRNode *root);

diff --git a/taichi/program/compile_config.cpp b/taichi/program/compile_config.cpp
@@ -8,6 +8,7 @@ CompileConfig::CompileConfig() {
   arch = host_arch();
   simd_width = default_simd_width(arch);
   external_optimization_level = 3;
+  packed = false;
   print_ir = false;
   print_accessor_ir = false;
   print_evaluator_ir = false;

diff --git a/taichi/program/compile_config.h b/taichi/program/compile_config.h
@@ -14,6 +14,7 @@ struct CompileConfig {
   bool lazy_compilation;
   int external_optimization_level;
   int max_vector_width;
+  bool packed;
   bool print_ir;
   bool print_accessor_ir;
   bool print_evaluator_ir;

diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp
@@ -429,7 +429,7 @@ void Program::initialize_llvm_runtime_snodes(const SNodeTree *tree,
 
 int Program::add_snode_tree(std::unique_ptr<SNode> root) {
   const int id = snode_trees_.size();
-  auto tree = std::make_unique<SNodeTree>(id, std::move(root));
+  auto tree = std::make_unique<SNodeTree>(id, std::move(root), config.packed);
   tree->root()->set_snode_tree_id(id);
   materialize_snode_tree(tree.get());
   snode_trees_.push_back(std::move(tree));

diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp
@@ -119,6 +119,7 @@ void export_lang(py::module &m) {
   py::class_<CompileConfig>(m, "CompileConfig")
       .def(py::init<>())
       .def_readwrite("arch", &CompileConfig::arch)
+      .def_readwrite("packed", &CompileConfig::packed)
       .def_readwrite("print_ir", &CompileConfig::print_ir)
       .def_readwrite("debug", &CompileConfig::debug)
       .def_readwrite("cfg_optimization", &CompileConfig::cfg_optimization)