Update custom ops and GPU function recipes for the latest MAX nightlies. (#16)
modular · Mar 6, 2025 · 2b568d7 · 2b568d7
1 parent 1387c37
commit 2b568d7
Show file tree

Hide file tree

Showing 15 changed files with 254 additions and 214 deletions.
diff --git a/custom-ops-ai-applications/benchmarks.mojo b/custom-ops-ai-applications/benchmarks.mojo
@@ -11,23 +11,91 @@
 # limitations under the License.
 # ===----------------------------------------------------------------------=== #
 
+from benchmark import ThroughputMeasure, BenchId, BenchMetric, Bench, Bencher
+from bit import log2_floor
+from buffer.dimlist import DimList
+from gpu.host import DeviceContext, DeviceBuffer
 from operations.top_k import TopK
-from gpu.host import DeviceContext
-from utils import IndexList
+from math import iota
 from max.driver import cpu
 from max.tensor import (
     ManagedTensorSlice,
     InputTensor,
     OutputTensor,
     StaticTensorSpec,
+    IOSpec,
+    Input,
+    Output,
+    MutableInput,
 )
-from random import rand
+from memory import AddressSpace
 from memory import UnsafePointer
+from random import rand
 from runtime.asyncrt import DeviceContextPtr
-from benchmark import ThroughputMeasure, BenchId, BenchMetric, Bench, Bencher
-from bit import log2_floor
 from sys import sizeof, has_nvidia_gpu_accelerator
-from memory import AddressSpace
+from utils import IndexList
+
+
+# Wrap a ManagedTensorSlice together with the DeviceBuffer that backs it. The
+# buffer has a lifetime, so Mojo's memory management is used and the
+# Python-initialized, garbage-collected version is sidestepped.
+@value
+struct _BenchTensor[
+    dtype: DType,
+    rank: Int, //,
+    io_spec: IOSpec,
+    static_spec: StaticTensorSpec[dtype, rank],
+]:
+    # Benchmark helper holding a tensor slice and the device buffer whose
+    # pointer the slice was built from.
+    #
+    # `dtype` and `rank` sit before the `//` marker, so call sites supply only
+    # `io_spec` and `static_spec`, e.g. `_BenchTensor[Input, val_spec](ctx)`.
+
+    # Tensor slice type specialized on this struct's IO spec and static spec.
+    alias tensor_type = ManagedTensorSlice[
+        io_spec=io_spec, static_spec=static_spec
+    ]
+    # Buffer type that holds the elements.
+    alias buffer_type = DeviceBuffer[dtype]
+    # Pointer type returned by `unsafe_ptr`.
+    alias ptr_type = UnsafePointer[Scalar[dtype]]
+    # Number of elements: the product of the static shape's dimensions.
+    alias size = Int(static_spec.shape.product())
+
+    var tensor: Self.tensor_type
+    var buffer: Self.buffer_type
+
+    # Creates a buffer of `Self.size` elements on `ctx` and wraps its pointer
+    # in a tensor slice that uses the static spec's shape and strides.
+    fn __init__(out self, ctx: DeviceContext) raises:
+        # The buffer is created first because the tensor below is constructed
+        # from the buffer's pointer.
+        self.buffer = ctx.enqueue_create_buffer[dtype](Self.size)
+
+        self.tensor = ManagedTensorSlice[
+            io_spec=io_spec, static_spec=static_spec
+        ](
+            self.buffer.unsafe_ptr(),
+            Self.static_spec.shape.into_index_list[rank](),
+            Self.static_spec.strides.into_index_list[rank](),
+        )
+
+    # Returns the pointer of the backing buffer.
+    fn unsafe_ptr(self) -> Self.ptr_type:
+        return self.buffer.unsafe_ptr()
+
+    # Calls `rand` on a host mapping of the buffer for `Self.size` elements,
+    # then returns `self` so the call can be chained onto construction.
+    # NOTE(review): presumably the mapping is synchronized back to the device
+    # when the `with` block exits; confirm against the DeviceBuffer docs.
+    fn rand(self) raises -> Self:
+        with self.buffer.map_to_host() as host_buffer:
+            rand(host_buffer.unsafe_ptr(), Self.size)
+            return self
+
+    # Calls `iota` on a host mapping of the buffer for `Self.size` elements,
+    # then returns `self` so the call can be chained onto construction.
+    fn iota(self) raises -> Self:
+        with self.buffer.map_to_host() as host_buffer:
+            iota(host_buffer.unsafe_ptr(), Self.size)
+            return self
+
+# TODO: Change StaticTensorSpec to use `IndexList` instead of `DimList` in order
+# to determine strides from shape at compile time, and align with
+# RuntimeTensorSpec.
+fn _static_spec[
+    dtype: DType, rank: Int
+](shape: DimList, strides: DimList, out spec: StaticTensorSpec[dtype, rank]):
+    # Builds a StaticTensorSpec from `shape` and `strides`, with every other
+    # field fixed: alignment is `sizeof[dtype]()`, the address space is
+    # GENERIC, `exclusive` is True, and no input/output lambdas are set.
+    # The result is written to the named `out spec` argument; `__type_of(spec)`
+    # names its declared type, `StaticTensorSpec[dtype, rank]`.
+    spec = __type_of(spec)(
+        shape=shape,
+        strides=strides,
+        alignment=sizeof[dtype](),
+        address_space=AddressSpace.GENERIC,
+        exclusive=True,
+        in_lambda=None,
+        out_lambda=None,
+    )
 
 
 def top_k():
@@ -41,39 +109,24 @@ def top_k():
 
     # Slightly better performance compared to `create_unknown`. Using global
     # address space doesn't improve perf for GPU.
-    alias val_spec = StaticTensorSpec[val_dtype, rank](
-        shape=(batch_size, K),
-        strides=(K, 1),
-        alignment=sizeof[val_dtype](),
-        address_space=AddressSpace.GENERIC,
-        exclusive=True,
-        in_lambda=None,
-        out_lambda=None,
-    )
-    alias idx_spec = StaticTensorSpec[idx_dtype, rank](
-        shape=(batch_size, K),
-        strides=(K, 1),
-        alignment=sizeof[idx_dtype](),
-        address_space=AddressSpace.GENERIC,
-        exclusive=True,
-        in_lambda=None,
-        out_lambda=None,
-    )
 
-    var in_vals = InputTensor[static_spec=val_spec].rand()
-    var out_vals = OutputTensor[static_spec=val_spec].rand()
-    var out_idxs = OutputTensor[static_spec=idx_spec].rand()
+    alias val_spec = _static_spec[val_dtype, rank]((batch_size, K), (K, 1))
+    alias idx_spec = _static_spec[idx_dtype, rank]((batch_size, K), (K, 1))
 
     var cpu_ctx = DeviceContext(api="cpu")
 
+    var in_vals = _BenchTensor[Input, val_spec](cpu_ctx).rand()
+    var out_vals = _BenchTensor[Output, val_spec](cpu_ctx).rand()
+    var out_idxs = _BenchTensor[Output, idx_spec](cpu_ctx).rand()
+
     @parameter
     @always_inline
     fn bench_cpu(mut b: Bencher) raises:
         @parameter
         @always_inline
         fn run_bench() raises:
             TopK.execute[K=K, target="cpu"](
-                out_vals, out_idxs, in_vals, cpu_ctx
+                out_vals.tensor, out_idxs.tensor, in_vals.tensor, cpu_ctx
             )
 
         b.iter[run_bench]()
@@ -88,21 +141,9 @@ def top_k():
     if has_nvidia_gpu_accelerator():
         var gpu_ctx = DeviceContext()
 
-        var in_vals_dev_buff = gpu_ctx.enqueue_create_buffer[val_dtype](els)
-        var out_vals_dev_buff = gpu_ctx.enqueue_create_buffer[val_dtype](els)
-        var out_idxs_dev_buff = gpu_ctx.enqueue_create_buffer[idx_dtype](els)
-
-        gpu_ctx.enqueue_copy(in_vals_dev_buff, in_vals.unsafe_ptr())
-
-        var out_vals_dev = OutputTensor[static_spec=val_spec](
-            out_vals_dev_buff.unsafe_ptr(), shape
-        )
-        var out_idxs_dev = OutputTensor[static_spec=idx_spec](
-            out_idxs_dev_buff.unsafe_ptr(), shape
-        )
-        var in_vals_dev = InputTensor[static_spec=val_spec](
-            in_vals_dev_buff.unsafe_ptr(), shape
-        )
+        var out_vals_dev = _BenchTensor[Output, val_spec](gpu_ctx).rand()
+        var out_idxs_dev = _BenchTensor[Output, idx_spec](gpu_ctx).rand()
+        var in_vals_dev = _BenchTensor[Input, val_spec](gpu_ctx).rand()
 
         @parameter
         @always_inline
@@ -111,25 +152,20 @@ def top_k():
             @always_inline
             fn kernel_launch(gpu_ctx: DeviceContext) raises:
                 TopK.execute[K=K, target="gpu"](
-                    out_vals_dev, out_idxs_dev, in_vals_dev, gpu_ctx
+                    out_vals_dev.tensor,
+                    out_idxs_dev.tensor,
+                    in_vals_dev.tensor,
+                    gpu_ctx,
                 )
 
             b.iter_custom[kernel_launch](gpu_ctx)
 
         b.bench_function[bench_gpu](
             BenchId("top_k_custom", "gpu"), flops, elements
         )
-        _ = in_vals_dev_buff
-        _ = out_vals_dev_buff
-        _ = out_idxs_dev_buff
-
     b.config.verbose_metric_names = False
     print(b)
 
-    _ = in_vals
-    _ = out_vals
-    _ = out_idxs
-
 
 def main():
     top_k()
diff --git a/custom-ops-ai-applications/operations/fused_attention.mojo b/custom-ops-ai-applications/operations/fused_attention.mojo
@@ -63,12 +63,11 @@ from gpu.id import block_idx, thread_idx
 from gpu.sync import barrier
 from gpu.memory import AddressSpace
 from runtime.asyncrt import DeviceContextPtr
+from tensor import OutputTensor, InputTensor
 from utils import Index
 
-from tensor import ManagedTensorSlice
 
-
-@register("fused_attention_custom", num_dps_outputs=1)
+@register("fused_attention_custom")
 struct FusedAttention:
     """Registers the `fused_attention_custom` op, allowing python to use it from the `max`
     package.
@@ -85,10 +84,10 @@ struct FusedAttention:
         BD: Int,  # Dimension of blocks to split K, V into
         target: StringLiteral,  # "cpu" or "gpu"
     ](
-        output: ManagedTensorSlice[type=dtype, rank=rank],
-        key: ManagedTensorSlice[type=dtype, rank=rank],
-        query: ManagedTensorSlice[type=dtype, rank=rank],
-        value: ManagedTensorSlice[type=dtype, rank=rank],
+        output: OutputTensor[type=dtype, rank=rank],
+        key: InputTensor[type=dtype, rank=rank],
+        query: InputTensor[type=dtype, rank=rank],
+        value: InputTensor[type=dtype, rank=rank],
         ctx: DeviceContextPtr,
     ) raises:
         constrained[rank == 2, "rank must be 2"]()
@@ -279,7 +278,7 @@ fn matmul[
         res.copy_from(out_sram)
 
 
-fn fused_attention_kenel[
+fn fused_attention_kernel[
     q_dtype: DType,
     q_layout: Layout,
     k_dtype: DType,
@@ -339,7 +338,7 @@ def fused_attention_gpu[
     V: LayoutTensor,
     mut O: LayoutTensor,
 ):
-    alias kernel_func = fused_attention_kenel[
+    alias kernel_func = fused_attention_kernel[
         Q.dtype,
         Q.layout,
         K.dtype,

diff --git a/custom-ops-ai-applications/operations/top_k.mojo b/custom-ops-ai-applications/operations/top_k.mojo
@@ -19,7 +19,7 @@ from compiler import register
 from gpu import WARP_SIZE, barrier, warp
 from gpu.id import block_dim, block_idx, thread_idx
 from gpu.memory import AddressSpace, external_memory
-from max.tensor import ManagedTensorSlice
+from max.tensor import OutputTensor, InputTensor
 from memory import Span
 from runtime.asyncrt import DeviceContextPtr
 from utils.index import IndexList
@@ -38,7 +38,7 @@ struct TopKElement[T: DType]:
         return self.val > rhs.val
 
 
-@register("top_k_custom", num_dps_outputs=2)
+@register("top_k_custom")
 struct TopK:
     """Registers the `top_k_custom` op, allowing python to use it from the `max`
     package. This is a simplified version without bottom_k and sorting options,
@@ -55,9 +55,9 @@ struct TopK:
         K: Int,
         target: StringLiteral,
     ](
-        out_vals: ManagedTensorSlice[type=type, rank=rank],
-        out_idxs: ManagedTensorSlice[type = DType.int32, rank=rank],
-        in_vals: ManagedTensorSlice[type=type, rank=rank],
+        out_vals: OutputTensor[type=type, rank=rank],
+        out_idxs: OutputTensor[type = DType.int32, rank=rank],
+        in_vals: InputTensor[type=type, rank=rank],
         ctx: DeviceContextPtr,
     ) raises:
         constrained[rank == 2, "rank must be 2"]()

diff --git a/custom-ops-ai-applications/top_k.py b/custom-ops-ai-applications/top_k.py
@@ -11,7 +11,6 @@
 # limitations under the License.
 # ===----------------------------------------------------------------------=== #
 import argparse
-import os
 from collections import defaultdict
 from pathlib import Path
 from typing import DefaultDict

diff --git a/custom-ops-introduction/README.md b/custom-ops-introduction/README.md
@@ -122,8 +122,8 @@ struct AddOne:
     fn execute[
         target: StringLiteral,
     ](
-        out: ManagedTensorSlice,
-        x: ManagedTensorSlice[type = out.type, rank = out.rank],
+        out: OutputTensor,
+        x: InputTensor[type = out.type, rank = out.rank],
         ctx: DeviceContextPtr,
     ):
         @parameter

diff --git a/custom-ops-introduction/operations/add_one.mojo b/custom-ops-introduction/operations/add_one.mojo
@@ -12,7 +12,7 @@
 # ===----------------------------------------------------------------------=== #
 
 import compiler
-from max.tensor import ManagedTensorSlice, foreach
+from tensor import OutputTensor, InputTensor, foreach
 from runtime.asyncrt import DeviceContextPtr
 
 from utils.index import IndexList
@@ -26,12 +26,12 @@ struct AddOne:
         target: StringLiteral,
     ](
         # as num_dps_outputs=1, the first argument is the "output"
-        out: ManagedTensorSlice,
+        out: OutputTensor,
         # starting here are the list of inputs
-        x: ManagedTensorSlice[type = out.type, rank = out.rank],
+        x: InputTensor[type = out.type, rank = out.rank],
         # the context is needed for some GPU calls
         ctx: DeviceContextPtr,
-    ):
+    ) raises:
         @parameter
         @always_inline
         fn elementwise_add_one[

diff --git a/custom-ops-introduction/operations/mandelbrot.mojo b/custom-ops-introduction/operations/mandelbrot.mojo
@@ -15,7 +15,7 @@
 import compiler
 from complex import ComplexSIMD
 from math import iota
-from max.tensor import ManagedTensorSlice, foreach
+from tensor import OutputTensor, InputTensor, foreach
 from runtime.asyncrt import DeviceContextPtr
 from utils.index import IndexList
 
@@ -31,7 +31,7 @@ struct Mandelbrot:
         target: StringLiteral,
     ](
         # as num_dps_outputs=1, the first argument is the "output"
-        out: ManagedTensorSlice,
+        out: OutputTensor,
         # starting here are the list of inputs
         min_x: Float32,
         min_y: Float32,
@@ -40,7 +40,7 @@ struct Mandelbrot:
         max_iterations: Int32,
         # the context is needed for some GPU calls
         ctx: DeviceContextPtr,
-    ):
+    ) raises:
         @parameter
         @always_inline
         fn elementwise_mandelbrot[

diff --git a/custom-ops-introduction/operations/vector_addition.mojo b/custom-ops-introduction/operations/vector_addition.mojo
@@ -17,15 +17,15 @@ import compiler
 from gpu import block_dim, block_idx, thread_idx
 from gpu.host import DeviceContext
 from runtime.asyncrt import DeviceContextPtr
-from tensor import ManagedTensorSlice, foreach
+from tensor import OutputTensor, InputTensor, foreach
 
 from utils.index import IndexList
 
 
 fn vector_addition_cpu(
-    out: ManagedTensorSlice,
-    lhs: ManagedTensorSlice[type = out.type, rank = out.rank],
-    rhs: ManagedTensorSlice[type = out.type, rank = out.rank],
+    out: OutputTensor,
+    lhs: InputTensor[type = out.type, rank = out.rank],
+    rhs: InputTensor[type = out.type, rank = out.rank],
     ctx: DeviceContextPtr,
 ):
     # Warning: This is an extremely inefficient implementation! It's merely an
@@ -37,9 +37,9 @@ fn vector_addition_cpu(
 
 
 fn vector_addition_gpu(
-    out: ManagedTensorSlice,
-    lhs: ManagedTensorSlice[type = out.type, rank = out.rank],
-    rhs: ManagedTensorSlice[type = out.type, rank = out.rank],
+    out: OutputTensor,
+    lhs: InputTensor[type = out.type, rank = out.rank],
+    rhs: InputTensor[type = out.type, rank = out.rank],
     ctx: DeviceContextPtr,
 ) raises:
     # Note: The following has not been tuned for any GPU hardware, and is an
@@ -75,10 +75,10 @@ struct VectorAddition:
         target: StringLiteral,
     ](
         # as num_dps_outputs=1, the first argument is the "output"
-        out: ManagedTensorSlice[rank=1],
+        out: OutputTensor[rank=1],
         # starting here are the list of inputs
-        lhs: ManagedTensorSlice[type = out.type, rank = out.rank],
-        rhs: ManagedTensorSlice[type = out.type, rank = out.rank],
+        lhs: InputTensor[type = out.type, rank = out.rank],
+        rhs: InputTensor[type = out.type, rank = out.rank],
         # the context is needed for some GPU calls
         ctx: DeviceContextPtr,
     ) raises: