Repo sync (#408)
anakinxc authored Nov 17, 2023
1 parent e0d6144 commit 18c3b37
Showing 26 changed files with 250 additions and 193 deletions.
4 changes: 2 additions & 2 deletions .bazelrc
@@ -57,8 +57,8 @@ build:linux-release --action_env=BAZEL_LINKLIBS=-l%:libstdc++.a:-l%:libgcc.a
build:macos --copt="-Xpreprocessor -fopenmp"
build:macos --copt=-Wno-unused-command-line-argument
build:macos --features=-supports_dynamic_linker
build:macos --macos_minimum_os=11.0
build:macos --host_macos_minimum_os=11.0
build:macos --macos_minimum_os=12.0
build:macos --host_macos_minimum_os=12.0

build:linux --copt=-fopenmp
build:linux --linkopt=-fopenmp
2 changes: 1 addition & 1 deletion .circleci/continue-config.yml
@@ -100,7 +100,7 @@ jobs:
path: test_logs.tar.gz
macOS_ut:
macos:
xcode: 14.3
xcode: 15.1
resource_class: macos.m1.large.gen1
steps:
- checkout
2 changes: 1 addition & 1 deletion .circleci/dev-release-config.yml
@@ -55,7 +55,7 @@ jobs:
python3 -m twine upload -r pypi -u __token__ -p ${PYPI_TWINE_TOKEN} dist/*.whl
macOS_arm64_publish:
macos:
xcode: 14.3
xcode: 15.1
resource_class: macos.m1.large.gen1
steps:
- checkout
4 changes: 2 additions & 2 deletions .circleci/release-config.yml
@@ -33,7 +33,7 @@ parameters:
jobs:
macOS_x64_publish:
macos:
xcode: 14.3
xcode: 15.1
resource_class: macos.x86.medium.gen2
parameters:
python_ver:
@@ -65,7 +65,7 @@ jobs:
python3 -m twine upload -r pypi -u __token__ -p ${PYPI_TWINE_TOKEN} dist/*.whl
macOS_arm64_publish:
macos:
xcode: 14.3
xcode: 15.1
resource_class: macos.m1.large.gen1
parameters:
python_ver:
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -10,6 +10,8 @@
>
> please add your unreleased change here.
- [Deprecated] macOS 11.x is no longer supported

## 20231108

- [Bugfix] Fix compatibility with latest Jax
10 changes: 8 additions & 2 deletions INSTALLATION.md
@@ -6,14 +6,14 @@

SPU has been tested with the following settings:

- Anolis OS 8.4 or later
- Ubuntu 22.04
- python3.8
- 8c16g (8 CPU cores, 16 GB memory)

### MacOS

We have conducted some successful preliminary testing on
macOS Monterey 12.4 with Intel processors and Apple Silicon.
macOS Sonoma 14.1 with Apple Silicon.

### Docker Image

@@ -47,3 +47,9 @@ pip install dist/*.whl --force-reinstall
```bash
bazel clean --expunge
```

#### Build with GPU support

```bash
export ENABLE_GPU_BUILD=1 && python setup.py bdist_wheel
```
16 changes: 16 additions & 0 deletions README.md
@@ -1,6 +1,7 @@
# SPU: Secure Processing Unit

[![CircleCI](https://dl.circleci.com/status-badge/img/gh/secretflow/spu/tree/main.svg?style=svg)](https://dl.circleci.com/status-badge/redirect/gh/secretflow/spu/tree/main)
![PyPI version](https://img.shields.io/pypi/v/spu)

SPU (Secure Processing Unit) aims to be a `provable`, `measurable` secure computation device,
which provides computation ability while keeping your private data protected.
@@ -25,8 +26,23 @@ This documentation also contains instructions for [build and testing](CONTRIBUTI

## Installation Guidelines

### Supported platforms

| | Linux x86_64 | Linux aarch64 | macOS x86_64 | macOS Apple Silicon | Windows x86_64 | Windows WSL2 x86_64 |
|------------|--------------|---------------|--------------|--------------|----------------|---------------------|
| CPU | yes | no | yes | yes | no | yes |
| NVIDIA GPU | experimental | no | no | n/a | no | no |

### Instructions

Please follow [Installation Guidelines](INSTALLATION.md) to install SPU.

### Hardware Requirements

| General Features | FourQ based PSI | GPU |
| ---------------- | --------------- | --- |
| AVX/ARMv8 | AVX2/ARMv8 | CUDA 11.8+ |

## Citing SPU

If you find SPU helpful for your research or development, please consider citing our [paper](https://www.usenix.org/conference/atc23/presentation/ma):
Binary file modified docs/imgs/vm_arch.png
2 changes: 1 addition & 1 deletion examples/python/ml/flax_llama7b/README.md
@@ -22,7 +22,7 @@ This example demonstrates how to use SPU to run secure inference on a pre-traine
Download trained LLaMA-7B [PyTorch-Version] from [Hugging Face](https://huggingface.co/openlm-research/open_llama_7b)
, and convert it to Flax.msgpack as:

```sh
```sh
cd path_to_EasyLM/EasyLM/models/llama
python convert_hf_to_easylm.py \
--checkpoint_dir path-to-flax-llama7b-dir \
2 changes: 1 addition & 1 deletion examples/python/ml/flax_llama7b_split/README.md
@@ -79,7 +79,7 @@ This example demonstrates how to use SPU to run secure inference on a pre-traine
4. Run `flax_llama7b_split` example

```sh
bazel run -c opt //examples/python/ml/flax_llama7b_split -- --model_path path-to-flax-llama7b-EasyLM.msgpack --tokenizer_path path-to-flax-llama7b-dir --config `pwd`/examples/python/ml/flax_llama7b_split/3pc.json
bazel run -c opt //examples/python/ml/flax_llama7b_split -- --model_path path-to-flax-llama7b-EasyLM.msgpack --tokenizer_path path-to-flax-llama7b-dir --config `pwd`/examples/python/ml/flax_llama7b_split/3pc.json
```

or (recommended)
2 changes: 1 addition & 1 deletion examples/python/ml/haiku_lstm/requirements.txt
@@ -1,2 +1,2 @@
dm-haiku
dm-haiku==0.0.10
plotnine
6 changes: 3 additions & 3 deletions libspu/kernel/hal/BUILD.bazel
@@ -260,9 +260,9 @@ spu_cc_library(
)

spu_cc_library(
name = "sort",
srcs = ["sort.cc"],
hdrs = ["sort.h"],
name = "permute",
srcs = ["permute.cc"],
hdrs = ["permute.h"],
deps = [
":polymorphic",
":public_helper",
109 changes: 105 additions & 4 deletions libspu/kernel/hal/sort.cc → libspu/kernel/hal/permute.cc
@@ -1,7 +1,20 @@
#include "libspu/kernel/hal/sort.h"
// Copyright 2023 Ant Group Co., Ltd.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "libspu/kernel/hal/permute.h"

#include <algorithm>
#include <future>

#include "libspu/core/context.h"
#include "libspu/kernel/hal/constants.h"
@@ -14,8 +27,19 @@
#include "libspu/spu.pb.h"

namespace spu::kernel::hal {

namespace {

// generate inverse permutation
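// e.g. for p = {2, 0, 1} this yields q = {1, 2, 0}, since q[p[i]] = i
// for every i; applying q after p restores the original order.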
Index GenInvPerm(const Index &p) {
Index q(p.size());
const auto n = static_cast<int64_t>(p.size());
for (int64_t i = 0; i < n; ++i) {
q[p[i]] = i;
}
return q;
}

Value Permute1D(SPUContext *, const Value &x, const Index &indices) {
SPU_ENFORCE(x.shape().size() == 1);
return Value(x.data().linear_gather(indices), x.dtype());
@@ -389,7 +413,9 @@ spu::Value GenInvPerm(SPUContext *ctx, absl::Span<spu::Value const> inputs,
SPU_ENFORCE_GT(bv.size(), 0U);

// 2. generate natural permutation for initialization
auto init_perm = iota(ctx, spu::DT_I64, inputs[0].numel());
auto dt =
ctx->config().field() == FieldType::FM32 ? spu::DT_I32 : spu::DT_I64;
auto init_perm = iota(ctx, dt, inputs[0].numel());
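// (DT_I32 on FM32: presumably a 64-bit iota would not fit the 32-bit ring)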
auto shared_perm = _p2s(ctx, init_perm);

// 3. generate shared inverse permutation by bit vector and process
@@ -448,7 +474,7 @@ std::vector<spu::Value> ApplyInvPerm(SPUContext *ctx,
// 4. <T> = SP(<SX>)
std::vector<spu::Value> v;
for (size_t i = 0; i < sx.size(); ++i) {
auto t = _inv_perm_sp(ctx, sx[i], m).setDtype(x[i].dtype());
auto t = _inv_perm_sp(ctx, sx[i], m);
v.emplace_back(std::move(t));
}

@@ -586,4 +612,79 @@ std::vector<spu::Value> simple_sort1d(SPUContext *ctx,
}
}

std::vector<spu::Value> permute(SPUContext *ctx,
absl::Span<const spu::Value> inputs,
int64_t permute_dim,
const Permute1dFn &permute_fn) {
// sanity check.
SPU_ENFORCE(!inputs.empty(), "Inputs should not be empty");
// move the to-permute dimension to the last dimension.
const Shape shape = inputs[0].shape();

// let
// - M is the number of inputs.
// - N is the number of vectors to permute
// - W is the vector length.
const int64_t M = inputs.size();
const int64_t W = shape.dim(permute_dim);
if (W == 0) {
return std::vector<spu::Value>(inputs.begin(), inputs.end());
}
const int64_t N = shape.numel() / W;
Axes perm(shape.ndim());
Axes unperm;
{
// e.g. permute_dim = 2 with ndim = 5 ==> perm = {0, 1, 4, 3, 2}
SPU_ENFORCE(permute_dim < shape.ndim());
std::iota(perm.begin(), perm.end(), 0);
std::swap(perm[permute_dim], perm.back());

auto q = GenInvPerm(Index(perm.begin(), perm.end()));
unperm = Axes(q.begin(), q.end());
}

Shape perm_shape(shape.begin(), shape.end());
std::swap(perm_shape[permute_dim], perm_shape.back());

// Do permute in 2-dimensions.
// First, reshape the input to (N, W)
std::vector<spu::Value> inputs2d;
for (auto const &input : inputs) {
auto transposed = hal::transpose(ctx, input, perm);
auto reshaped = hal::reshape(ctx, transposed, {N, W});
inputs2d.push_back(reshaped);
}

// Apply the 1-d permute function to each of the N slices.
// permuted1d is (N, M, W): each element is a vector of length W.
std::vector<std::vector<spu::Value>> permuted1d;
for (int64_t ni = 0; ni < N; ni++) {
// TODO: all these small permutations could be done in parallel.
std::vector<spu::Value> input_i;
input_i.reserve(inputs2d.size());
for (auto const &input : inputs2d) {
// we need a 1-d tensor here
input_i.push_back(
hal::reshape(ctx, hal::slice(ctx, input, {ni, 0}, {ni + 1, W}), {W}));
}

permuted1d.push_back(permute_fn(input_i));
}

// result is (M,shape)
std::vector<spu::Value> results(M);
for (int64_t mi = 0; mi < M; mi++) {
std::vector<spu::Value> output2d;
for (int64_t ni = 0; ni < N; ni++) {
output2d.push_back(hal::reshape(ctx, permuted1d[ni][mi], {1, W}));
}
auto result = hal::concatenate(ctx, output2d, 0);
// Permute it back, final result is (M, shape)
result = hal::reshape(ctx, result, perm_shape);
results[mi] = hal::transpose(ctx, result, unperm);
}

return results;
}

} // namespace spu::kernel::hal
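
As a side note on the (N, W) decomposition above: `permute` moves the to-permute dimension to the last axis, slices out each of the N length-W rows, applies the 1-d permute function, and reassembles. A minimal standalone sketch of that inner loop, with plain `std::vector` standing in for `spu::Value` and a reversing lambda standing in for `Permute1dFn` (editor's illustration, not SPU code):

```cpp
// Minimal sketch of the (N, W) inner loop; all names are illustrative.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const int64_t N = 2;  // number of vectors to permute
  const int64_t W = 4;  // vector length
  std::vector<int> data = {0, 1, 2, 3, 4, 5, 6, 7};  // shape (N, W), row-major

  // Stand-in for Permute1dFn: reverse each 1-d slice.
  auto permute1d = [](std::vector<int> v) {
    std::reverse(v.begin(), v.end());
    return v;
  };

  // Slice out each length-W row, permute it, and write it back.
  for (int64_t ni = 0; ni < N; ++ni) {
    std::vector<int> row(data.begin() + ni * W, data.begin() + (ni + 1) * W);
    row = permute1d(row);
    std::copy(row.begin(), row.end(), data.begin() + ni * W);
  }

  for (int v : data) std::cout << v << ' ';  // prints: 3 2 1 0 7 6 5 4
  std::cout << '\n';
}
```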
24 changes: 24 additions & 0 deletions libspu/kernel/hal/sort.h → libspu/kernel/hal/permute.h
@@ -1,3 +1,17 @@
// Copyright 2023 Ant Group Co., Ltd.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "absl/types/span.h"
@@ -12,6 +26,9 @@ namespace spu::kernel::hal {

using CompFn = std::function<spu::Value(absl::Span<const spu::Value>)>;

using Permute1dFn =
std::function<std::vector<spu::Value>(absl::Span<const spu::Value>)>;

// sort direction for sorters without comparators
enum class SortDirection {
Ascending,
@@ -36,4 +53,11 @@ std::vector<spu::Value> simple_sort1d(SPUContext *ctx,
SortDirection direction, int64_t num_keys,
int64_t valid_bits);

// transforms an n-d permute into 1-d permutes and applies the permute
// function to each 1-d array
std::vector<spu::Value> permute(SPUContext *ctx,
absl::Span<const spu::Value> inputs,
int64_t permute_dim,
const Permute1dFn &permute_fn);

} // namespace spu::kernel::hal
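
The axis bookkeeping behind this declaration can be sanity-checked in isolation. A minimal sketch (editor's illustration, not SPU code) computes the forward axes and their inverse for `permute_dim = 2` on a rank-5 tensor, matching the `{0, 1, 4, 3, 2}` comment in permute.cc:

```cpp
// Sanity-check of the perm/unperm axis bookkeeping used by hal::permute.
#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  const int64_t ndim = 5;
  const int64_t permute_dim = 2;

  // Forward axes: identity with permute_dim swapped to the back.
  std::vector<int64_t> perm(ndim);
  std::iota(perm.begin(), perm.end(), 0);     // {0, 1, 2, 3, 4}
  std::swap(perm[permute_dim], perm.back());  // {0, 1, 4, 3, 2}

  // Inverse axes, computed as in GenInvPerm: unperm[perm[i]] = i.
  std::vector<int64_t> unperm(ndim);
  for (int64_t i = 0; i < ndim; ++i) {
    unperm[perm[i]] = i;
  }

  for (auto a : perm) std::cout << a << ' ';
  std::cout << "| ";
  for (auto a : unperm) std::cout << a << ' ';
  std::cout << '\n';  // prints: 0 1 4 3 2 | 0 1 4 3 2
}
```

Because the forward axes are produced by a single swap of the identity, the permutation is a transposition and hence self-inverse, so `unperm` always equals `perm` in this construction.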
2 changes: 1 addition & 1 deletion libspu/kernel/hal/prot_wrapper.cc
@@ -218,7 +218,7 @@ MAP_BINARY_OP(equal_pp)
SPU_ENFORCE(x.shape().ndim() == 1, "x should be a 1-d tensor"); \
auto ret = mpc::NAME(ctx, x, y); \
SPU_ENFORCE(ret.has_value(), "{} api not implemented", #NAME); \
return ret.value(); \
return ret.value().setDtype(x.dtype()); \
} // namespace spu::kernel::hal

MAP_OPTIONAL_PERM_OP(perm_ss);
12 changes: 6 additions & 6 deletions libspu/kernel/hlo/BUILD.bazel
Expand Up @@ -269,19 +269,16 @@ spu_cc_library(
srcs = ["sort.cc"],
hdrs = ["sort.h"],
deps = [
":basic_binary",
":casting",
":const",
":utils",
"//libspu/kernel/hal:shape_ops",
"//libspu/kernel/hal:sort",
"//libspu/kernel/hal:permute",
],
)

spu_cc_test(
name = "sort_test",
srcs = ["sort_test.cc"],
deps = [
":casting",
":const",
":sort",
"//libspu/kernel:test_util",
"//libspu/kernel/hal:polymorphic",
@@ -312,7 +309,10 @@ spu_cc_library(
hdrs = ["shuffle.h"],
deps = [
":sort",
"//libspu/core:context",
"//libspu/kernel/hal:permute",
"//libspu/kernel/hal:polymorphic",
"//libspu/kernel/hal:prot_wrapper",
"//libspu/kernel/hal:random",
],
)
