From 00c92f22997b9986c5a584c52ca8153a52a4dab2 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Thu, 19 Dec 2024 04:54:23 +0000
Subject: [PATCH] Add: Apple Neural Engine optimizations

Co-authored-by: Kirill Solodskikh <23156352+ArnoldMSU@users.noreply.github.com>
Co-authored-by: Azim Kurbanov <33081108+b1n0@users.noreply.github.com>
Co-authored-by: Ruslan Aydarkhanov <7030684+Aydarkhan@users.noreply.github.com>
Co-authored-by: Andrey Ageev <36598717+AndreyAgeev@users.noreply.github.com>
---
 swift/Encoders.swift | 52 ++++++++++++++++++++++++++++++--------------
 swift/README.md      | 30 +++++++++++++++++++++++--
 2 files changed, 64 insertions(+), 18 deletions(-)

diff --git a/swift/Encoders.swift b/swift/Encoders.swift
index ebfcd73..64b69f1 100644
--- a/swift/Encoders.swift
+++ b/swift/Encoders.swift
@@ -94,20 +94,26 @@ func readConfig(fromPath path: String) throws -> [String: Any] {
 }
 
 /// Compiles and loads a machine learning model from a URL.
-/// - Parameter modelURL: The URL where the model package is located.
+/// - Parameters:
+///   - modelURL: The URL where the model package is located.
+///   - computeUnits: The hardware devices to use for model computation.
 /// - Returns: An instance of `MLModel`.
-func readModel(fromURL modelURL: URL) throws -> MLModel {
+func readModel(fromURL modelURL: URL, computeUnits: MLComputeUnits = .all) throws -> MLModel {
     let compiledModelURL = try MLModel.compileModel(at: modelURL)
-    return try MLModel(contentsOf: compiledModelURL)
+    let config = MLModelConfiguration()
+    config.computeUnits = computeUnits
+    return try MLModel(contentsOf: compiledModelURL, configuration: config)
 }
 
 /// Loads a machine learning model from a local file path.
-/// - Parameter path: The file path where the model file is located.
+/// - Parameters:
+///   - path: The file path where the model file is located.
+///   - computeUnits: The hardware devices to use for model computation.
 /// - Returns: An instance of `MLModel`.
-func readModel(fromPath path: String) throws -> MLModel {
+func readModel(fromPath path: String, computeUnits: MLComputeUnits = .all) throws -> MLModel {
     let absPath = path.hasPrefix("/") ? path : FileManager.default.currentDirectoryPath + "/" + path
     let modelURL = URL(fileURLWithPath: absPath, isDirectory: true)
-    return try readModel(fromURL: modelURL)
+    return try readModel(fromURL: modelURL, computeUnits: computeUnits)
 }
 
 /// Encodes text input into embeddings using a machine learning model.
@@ -120,10 +126,16 @@ public class TextEncoder {
     ///   - modelPath: The path to the directory containing the machine learning model.
     ///   - configPath: Optional. The path to the configuration file. Defaults to config.json in the model directory.
     ///   - tokenizerPath: Optional. The path to the tokenizer file. Defaults to tokenizer.json in the model directory.
-    public init(modelPath: String, configPath: String? = nil, tokenizerPath: String? = nil) throws {
+    ///   - computeUnits: The hardware devices to use for model computation. Use `.cpuAndNeuralEngine` for best performance.
+    public init(
+        modelPath: String,
+        configPath: String? = nil,
+        tokenizerPath: String? = nil,
+        computeUnits: MLComputeUnits = .all
+    ) throws {
         let finalConfigPath = configPath ?? modelPath + "/config.json"
         let finalTokenizerPath = tokenizerPath ?? modelPath + "/tokenizer.json"
-        self.model = try readModel(fromPath: modelPath)
+        self.model = try readModel(fromPath: modelPath, computeUnits: computeUnits)
         self.processor = try TextProcessor(
             configPath: finalConfigPath,
             tokenizerPath: finalTokenizerPath,
@@ -135,16 +147,20 @@ public class TextEncoder {
     /// - Parameters:
     ///   - modelName: The identifier for the model repository.
     ///   - hubApi: The API object to interact with the model hub. Defaults to a shared instance.
-    public init(modelName: String, hubApi: HubApi = .shared) async throws {
+    ///   - computeUnits: The hardware devices to use for model computation. Passing `.cpuAndNeuralEngine` downloads the `text_encoder_neural.mlpackage` variant instead of `text_encoder.mlpackage`.
+    public init(modelName: String, hubApi: HubApi = .shared, computeUnits: MLComputeUnits = .all) async throws {
         let repo = Hub.Repo(id: modelName)
+        let encoderMask =
+            computeUnits == .cpuAndNeuralEngine ? "text_encoder_neural.mlpackage" : "text_encoder.mlpackage"
         let modelURL = try await hubApi.snapshot(
             from: repo,
-            matching: ["text_encoder.mlpackage/*", "config.json", "tokenizer.json"]
+            matching: ["\(encoderMask)/*", "config.json", "tokenizer.json"]
         )
         let configPath = modelURL.appendingPathComponent("config.json").path
         let tokenizerPath = modelURL.appendingPathComponent("tokenizer.json").path
         self.model = try readModel(
-            fromURL: modelURL.appendingPathComponent("text_encoder.mlpackage", isDirectory: true)
+            fromURL: modelURL.appendingPathComponent(encoderMask, isDirectory: true),
+            computeUnits: computeUnits
         )
         self.processor = try TextProcessor(configPath: configPath, tokenizerPath: tokenizerPath, model: self.model)
     }
@@ -174,9 +190,9 @@ public class ImageEncoder {
     /// - Parameters:
     ///   - modelPath: The path to the directory containing the machine learning model.
     ///   - configPath: Optional. The path to the configuration file. Defaults to config.json in the model directory.
-    public init(modelPath: String, configPath: String? = nil) throws {
+    public init(modelPath: String, configPath: String? = nil, computeUnits: MLComputeUnits = .all) throws {
         let finalConfigPath = configPath ?? modelPath + "/config.json"
-        self.model = try readModel(fromPath: modelPath)
+        self.model = try readModel(fromPath: modelPath, computeUnits: computeUnits)
         self.processor = try ImageProcessor(configPath: finalConfigPath)
     }
 
@@ -184,12 +200,16 @@ public class ImageEncoder {
     /// - Parameters:
     ///   - modelName: The identifier for the model repository.
     ///   - hubApi: The API object to interact with the model hub. Defaults to a shared instance.
-    public init(modelName: String, hubApi: HubApi = .shared) async throws {
+    ///   - computeUnits: The hardware devices to use for model computation. Passing `.cpuAndNeuralEngine` downloads the `image_encoder_neural.mlpackage` variant instead of `image_encoder.mlpackage`.
+    public init(modelName: String, hubApi: HubApi = .shared, computeUnits: MLComputeUnits = .all) async throws {
         let repo = Hub.Repo(id: modelName)
-        let modelURL = try await hubApi.snapshot(from: repo, matching: ["image_encoder.mlpackage/*", "config.json"])
+        let encoderMask =
+            computeUnits == .cpuAndNeuralEngine ? "image_encoder_neural.mlpackage" : "image_encoder.mlpackage"
+        let modelURL = try await hubApi.snapshot(from: repo, matching: ["\(encoderMask)/*", "config.json"])
         let configPath = modelURL.appendingPathComponent("config.json").path
         self.model = try readModel(
-            fromURL: modelURL.appendingPathComponent("image_encoder.mlpackage", isDirectory: true)
+            fromURL: modelURL.appendingPathComponent(encoderMask, isDirectory: true),
+            computeUnits: computeUnits
         )
         self.processor = try ImageProcessor(configPath: configPath)
     }
diff --git a/swift/README.md b/swift/README.md
index 8fa0eb8..e6b7a3d 100644
--- a/swift/README.md
+++ b/swift/README.md
@@ -19,7 +19,10 @@ import UForm
 ### Text Embeddings
 
 ```swift
-let textModel = try await TextEncoder(modelName: "unum-cloud/uform3-image-text-english-small")
+let textModel = try await TextEncoder(
+    modelName: "unum-cloud/uform3-image-text-english-small",
+    computeUnits: .cpuAndNeuralEngine
+)
 let text = "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie."
 let textEmbedding: Embedding = try textModel.encode(text)
 let textVector: [Float32] = textEmbedding.asFloats()
@@ -28,7 +31,10 @@ let textVector: [Float32] = textEmbedding.asFloats()
 ### Image Embeddings
 
 ```swift
-let imageModel = try await ImageEncoder(modelName: "unum-cloud/uform3-image-text-english-small")
+let imageModel = try await ImageEncoder(
+    modelName: "unum-cloud/uform3-image-text-english-small",
+    computeUnits: .cpuAndNeuralEngine
+)
 let imageURL = "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true"
 guard let url = URL(string: imageURL),
     let imageSource = CGImageSourceCreateWithURL(url as CFURL, nil),
@@ -40,6 +46,26 @@ var imageEmbedding: Embedding = try imageModel.encode(cgImage)
 var imageVector: [Float32] = embedding.asFloats()
 ```
 
+### Choosing Target Device
+
+Apple chips provide several functional units capable of high-throughput matrix multiplication and AI inference.
+These units — the CPU, GPU, and Neural Engine — are selected with the `computeUnits` argument.
+For maximum compatibility, the `.all` option is used by default.
+Apple's scheduler does not always pick the fastest device, so specifying the target explicitly can yield significant performance gains, especially when the models are pre-compiled for the Apple Neural Engine.
+
+| Model               | GPU Text E. | ANE Text E. | GPU Image E. | ANE Image E. |
+| :------------------ | ----------: | ----------: | -----------: | -----------: |
+| `english-small`     |     2.53 ms |     0.53 ms |      6.57 ms |      1.23 ms |
+| `english-base`      |     2.54 ms |     0.61 ms |     18.90 ms |      3.79 ms |
+| `english-large`     |     2.30 ms |     0.61 ms |     79.68 ms |     20.94 ms |
+| `multilingual-base` |     2.34 ms |     0.50 ms |     18.98 ms |      3.77 ms |
+
+> Measured on an Apple M4 iPad running iOS 18.2.
+> Batch size is 1, and the model is pre-loaded into memory.
+> The original encoders use `f32` single-precision numbers for maximum compatibility, and mostly rely on __GPU__ for computation.
+> The quantized encoders use a mixture of `i8`, `f16`, and `f32` numbers for maximum performance, and mostly rely on the Apple Neural Engine (__ANE__) for computation.
+> The median latency is reported.
+
 ### Computing Distances
 
 There are several ways to compute distances between embeddings, once you have them.