
Switch from use_cuda_malloc flag to a general pytorch_cuda_alloc_conf config field that allows full customization of the CUDA allocator.
RyanJDick committed Feb 24, 2025
1 parent 5f1de52 commit 76430cb
Showing 4 changed files with 27 additions and 37 deletions.
invokeai/app/run_app.py (6 changes: 3 additions & 3 deletions)
@@ -1,7 +1,7 @@
 import uvicorn
 
 from invokeai.app.services.config.config_default import get_config
-from invokeai.app.util.torch_cuda_allocator import enable_torch_cuda_malloc
+from invokeai.app.util.torch_cuda_allocator import configure_torch_cuda_allocator
 from invokeai.backend.util.logging import InvokeAILogger
 from invokeai.frontend.cli.arg_parser import InvokeAIArgs

@@ -27,8 +27,8 @@ def run_app() -> None:
 
     # Configure the torch CUDA memory allocator.
     # NOTE: It is important that this happens before torch is imported.
-    if app_config.use_cuda_malloc:
-        enable_torch_cuda_malloc()
+    if app_config.pytorch_cuda_alloc_conf:
+        configure_torch_cuda_allocator(app_config.pytorch_cuda_alloc_conf, logger)
 
     # Import from startup_utils here to avoid importing torch before enable_torch_cuda_malloc() is called.
     from invokeai.app.util.startup_utils import (
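
For reference, a minimal sketch of the ordering constraint the hunk above relies on: the allocator must be configured from the config value before anything imports torch, because PYTORCH_CUDA_ALLOC_CONF is only read when torch initializes its CUDA allocator. The body below is illustrative only, not the actual run_app() implementation; the stdlib logger and the trailing comment stand in for details that are not part of this diff.

# Illustrative startup ordering only; not the real run_app() body.
import logging

from invokeai.app.services.config.config_default import get_config
from invokeai.app.util.torch_cuda_allocator import configure_torch_cuda_allocator


def run_app() -> None:
    app_config = get_config()
    # Assumption: a plain stdlib logger stands in for the app's InvokeAILogger here.
    logger = logging.getLogger(__name__)

    # Configure the allocator first: PYTORCH_CUDA_ALLOC_CONF is only honored if it
    # is set before torch is imported anywhere in the process.
    if app_config.pytorch_cuda_alloc_conf:
        configure_torch_cuda_allocator(app_config.pytorch_cuda_alloc_conf, logger)

    # Only after this point is it safe to import modules that pull in torch
    # (e.g. invokeai.app.util.startup_utils in the real module).
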
invokeai/app/services/config/config_default.py (4 changes: 2 additions & 2 deletions)
@@ -91,7 +91,7 @@ class InvokeAIAppConfig(BaseSettings):
         ram: DEPRECATED: This setting is no longer used. It has been replaced by `max_cache_ram_gb`, but most users will not need to use this config since automatic cache size limits should work well in most cases. This config setting will be removed once the new model cache behavior is stable.
         vram: DEPRECATED: This setting is no longer used. It has been replaced by `max_cache_vram_gb`, but most users will not need to use this config since automatic cache size limits should work well in most cases. This config setting will be removed once the new model cache behavior is stable.
         lazy_offload: DEPRECATED: This setting is no longer used. Lazy-offloading is enabled by default. This config setting will be removed once the new model cache behavior is stable.
-        use_cuda_malloc: Use CUDA's built-in async memory allocator rather than PyTorch's native memory allocator. This can reduce VRAM usage and improve performance in many cases, but can cause a regression on certain systems.
+        pytorch_cuda_alloc_conf: Configure the Torch CUDA memory allocator. This will impact peak reserved VRAM usage and performance. Setting to "backend:cudaMallocAsync" works well on many systems. The optimal configuration is highly dependent on the system configuration (device type, VRAM, CUDA driver version, etc.), so must be tuned experimentally.
         device: Preferred execution device. `auto` will choose the device depending on the hardware platform and the installed torch capabilities.<br>Valid values: `auto`, `cpu`, `cuda`, `cuda:1`, `mps`
         precision: Floating point precision. `float16` will consume half the memory of `float32` but produce slightly lower-quality images. The `auto` setting will guess the proper precision based on your video card and operating system.<br>Valid values: `auto`, `float16`, `bfloat16`, `float32`
         sequential_guidance: Whether to calculate guidance in serial instead of in parallel, lowering memory requirements.
@@ -171,7 +171,7 @@ class InvokeAIAppConfig(BaseSettings):
     lazy_offload: bool = Field(default=True, description="DEPRECATED: This setting is no longer used. Lazy-offloading is enabled by default. This config setting will be removed once the new model cache behavior is stable.")
 
     # PyTorch Memory Allocator
-    use_cuda_malloc: bool = Field(default=False, description="Use CUDA's built-in async memory allocator rather than PyTorch's native memory allocator. This can reduce VRAM usage and improve performance in many cases, but can cause a regression on certain systems.")
+    pytorch_cuda_alloc_conf: Optional[str] = Field(default=None, description="Configure the Torch CUDA memory allocator. This will impact peak reserved VRAM usage and performance. Setting to \"backend:cudaMallocAsync\" works well on many systems. The optimal configuration is highly dependent on the system configuration (device type, VRAM, CUDA driver version, etc.), so must be tuned experimentally.")
 
     # DEVICE
     device: DEVICE = Field(default="auto", description="Preferred execution device. `auto` will choose the device depending on the hardware platform and the installed torch capabilities.")
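
To illustrate the new field in use, here is a hedged sketch of how the setting might be populated programmatically. Only the default of None and the "backend:cudaMallocAsync" value come from the diff above; the other value is an example drawn from PyTorch's PYTORCH_CUDA_ALLOC_CONF documentation, not a setting this commit prescribes.

# Sketch: possible values for the new field (InvokeAIAppConfig is a pydantic
# BaseSettings, so in practice it is usually populated from invokeai.yaml or
# environment variables rather than constructed directly).
from invokeai.app.services.config.config_default import InvokeAIAppConfig

# Default: None, i.e. the PyTorch CUDA allocator is left untouched.
config = InvokeAIAppConfig()
assert config.pytorch_cuda_alloc_conf is None

# Equivalent of the old use_cuda_malloc=True flag:
config = InvokeAIAppConfig(pytorch_cuda_alloc_conf="backend:cudaMallocAsync")

# Any other PYTORCH_CUDA_ALLOC_CONF string is passed through as-is, e.g. tuning
# the native allocator instead (example options from the PyTorch docs):
config = InvokeAIAppConfig(pytorch_cuda_alloc_conf="backend:native,max_split_size_mb:512")
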
invokeai/app/util/torch_cuda_allocator.py (39 changes: 18 additions & 21 deletions)
@@ -1,21 +1,12 @@
+import logging
 import os
 
 
-def is_torch_cuda_malloc_enabled():
-    """Check if the cudaMallocAsync memory allocator backend is being used."""
-    # NOTE: We do not import torch at the file level, because enable_torch_cuda_malloc() must be called before torch is
-    # imported.
-    import torch
-
-    if not torch.cuda.is_available():
-        return False
-
-    allocator_backend = torch.cuda.get_allocator_backend()
-    return allocator_backend == "cudaMallocAsync"
-
-
-def enable_torch_cuda_malloc():
-    """Configure the PyTorch CUDA memory allocator to use the cudaMallocAsync memory allocator backend."""
+def configure_torch_cuda_allocator(pytorch_cuda_alloc_conf: str, logger: logging.Logger | None = None):
+    """Configure the PyTorch CUDA memory allocator. See
+    https://pytorch.org/docs/stable/notes/cuda.html#optimizing-memory-usage-with-pytorch-cuda-alloc-conf for supported
+    configurations.
+    """
 
     # Raise if the PYTORCH_CUDA_ALLOC_CONF environment variable is already set.
     prev_cuda_alloc_conf = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", None)
Expand All @@ -25,9 +16,9 @@ def enable_torch_cuda_malloc():
f"'{prev_cuda_alloc_conf}'."
)

# Enable the cudaMallocAsync memory allocator backend.
# Configure the PyTorch CUDA memory allocator.
# NOTE: It is important that this happens before torch is imported.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = pytorch_cuda_alloc_conf

import torch

@@ -37,9 +28,15 @@ def enable_torch_cuda_malloc():
             "Attempted to configure the PyTorch CUDA memory allocator, but no CUDA devices are available."
         )
 
-    # Confirm that the cudaMallocAsync memory allocator backend is now being used.
-    if not is_torch_cuda_malloc_enabled():
+    # Verify that the torch allocator was properly configured.
+    allocator_backend = torch.cuda.get_allocator_backend()
+    expected_backend = "cudaMallocAsync" if "cudaMallocAsync" in pytorch_cuda_alloc_conf else "native"
+    if allocator_backend != expected_backend:
         raise RuntimeError(
-            "Failed to enable the cudaMallocAsync memory allocator backend. This likely means that the torch memory "
-            "allocator was initialized before calling this function."
+            f"Failed to configure the PyTorch CUDA memory allocator. Expected backend: '{expected_backend}', but got "
+            f"'{allocator_backend}'. Verify that 1) the pytorch_cuda_alloc_conf is set correctly, and 2) that torch is "
+            "not imported before calling configure_torch_cuda_allocator()."
         )
+
+    if logger is not None:
+        logger.info(f"PyTorch CUDA memory allocator: {torch.cuda.get_allocator_backend()}")
tests/app/util/test_torch_cuda_allocator.py (15 changes: 4 additions & 11 deletions)
@@ -1,18 +1,11 @@
 import pytest
 
-from invokeai.app.util.torch_cuda_allocator import enable_torch_cuda_malloc, is_torch_cuda_malloc_enabled
+from invokeai.app.util.torch_cuda_allocator import configure_torch_cuda_allocator
 
 
-def test_is_torch_cuda_malloc_enabled():
-    """Test that if torch CUDA malloc hasn't been explicitly enabled, then is_torch_cuda_malloc_enabled() returns
-    False.
-    """
-    assert not is_torch_cuda_malloc_enabled()
-
-
-def test_enable_torch_cuda_malloc_raises_if_torch_is_already_imported():
+def test_configure_torch_cuda_allocator_raises_if_torch_is_already_imported():
     """Test that enable_torch_cuda_malloc() raises a RuntimeError if torch is already imported."""
     import torch  # noqa: F401
 
-    with pytest.raises(RuntimeError):
-        enable_torch_cuda_malloc()
+    with pytest.raises(RuntimeError, match="Failed to configure the PyTorch CUDA memory allocator."):
+        configure_torch_cuda_allocator("backend:cudaMallocAsync")
