diff --git a/invokeai/app/api/routers/model_manager.py b/invokeai/app/api/routers/model_manager.py index 191db741ae0..16dc51a29f1 100644 --- a/invokeai/app/api/routers/model_manager.py +++ b/invokeai/app/api/routers/model_manager.py @@ -4,7 +4,6 @@ import contextlib import io import pathlib -import shutil import traceback from copy import deepcopy from enum import Enum @@ -21,7 +20,6 @@ from typing_extensions import Annotated from invokeai.app.api.dependencies import ApiDependencies -from invokeai.app.services.config import get_config from invokeai.app.services.model_images.model_images_common import ModelImageFileNotFoundException from invokeai.app.services.model_install.model_install_common import ModelInstallJob from invokeai.app.services.model_records import ( @@ -848,74 +846,6 @@ async def get_starter_models() -> StarterModelResponse: return StarterModelResponse(starter_models=starter_models, starter_bundles=starter_bundles) -@model_manager_router.get( - "/model_cache", - operation_id="get_cache_size", - response_model=float, - summary="Get maximum size of model manager RAM or VRAM cache.", -) -async def get_cache_size(cache_type: CacheType = Query(description="The cache type", default=CacheType.RAM)) -> float: - """Return the current RAM or VRAM cache size setting (in GB).""" - cache = ApiDependencies.invoker.services.model_manager.load.ram_cache - value = 0.0 - if cache_type == CacheType.RAM: - value = cache.max_cache_size - elif cache_type == CacheType.VRAM: - value = cache.max_vram_cache_size - return value - - -@model_manager_router.put( - "/model_cache", - operation_id="set_cache_size", - response_model=float, - summary="Set maximum size of model manager RAM or VRAM cache, optionally writing new value out to invokeai.yaml config file.", -) -async def set_cache_size( - value: float = Query(description="The new value for the maximum cache size"), - cache_type: CacheType = Query(description="The cache type", default=CacheType.RAM), - persist: bool = Query(description="Write new value out to invokeai.yaml", default=False), -) -> float: - """Set the current RAM or VRAM cache size setting (in GB). .""" - cache = ApiDependencies.invoker.services.model_manager.load.ram_cache - app_config = get_config() - # Record initial state. - vram_old = app_config.vram - ram_old = app_config.ram - - # Prepare target state. - vram_new = vram_old - ram_new = ram_old - if cache_type == CacheType.RAM: - ram_new = value - elif cache_type == CacheType.VRAM: - vram_new = value - else: - raise ValueError(f"Unexpected {cache_type=}.") - - config_path = app_config.config_file_path - new_config_path = config_path.with_suffix(".yaml.new") - - try: - # Try to apply the target state. - cache.max_vram_cache_size = vram_new - cache.max_cache_size = ram_new - app_config.ram = ram_new - app_config.vram = vram_new - if persist: - app_config.write_file(new_config_path) - shutil.move(new_config_path, config_path) - except Exception as e: - # If there was a failure, restore the initial state. 
- cache.max_cache_size = ram_old - cache.max_vram_cache_size = vram_old - app_config.ram = ram_old - app_config.vram = vram_old - - raise RuntimeError("Failed to update cache size") from e - return value - - @model_manager_router.get( "/stats", operation_id="get_stats", diff --git a/invokeai/app/invocations/compel.py b/invokeai/app/invocations/compel.py index f3686ae6488..d73709d8e86 100644 --- a/invokeai/app/invocations/compel.py +++ b/invokeai/app/invocations/compel.py @@ -63,9 +63,6 @@ class CompelInvocation(BaseInvocation): @torch.no_grad() def invoke(self, context: InvocationContext) -> ConditioningOutput: - tokenizer_info = context.models.load(self.clip.tokenizer) - text_encoder_info = context.models.load(self.clip.text_encoder) - def _lora_loader() -> Iterator[Tuple[ModelPatchRaw, float]]: for lora in self.clip.loras: lora_info = context.models.load(lora.lora) @@ -76,12 +73,13 @@ def _lora_loader() -> Iterator[Tuple[ModelPatchRaw, float]]: # loras = [(context.models.get(**lora.dict(exclude={"weight"})).context.model, lora.weight) for lora in self.clip.loras] + text_encoder_info = context.models.load(self.clip.text_encoder) ti_list = generate_ti_list(self.prompt, text_encoder_info.config.base, context) with ( # apply all patches while the model is on the target device text_encoder_info.model_on_device() as (cached_weights, text_encoder), - tokenizer_info as tokenizer, + context.models.load(self.clip.tokenizer) as tokenizer, LayerPatcher.apply_smart_model_patches( model=text_encoder, patches=_lora_loader(), @@ -140,9 +138,7 @@ def run_clip_compel( lora_prefix: str, zero_on_empty: bool, ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: - tokenizer_info = context.models.load(clip_field.tokenizer) text_encoder_info = context.models.load(clip_field.text_encoder) - # return zero on empty if prompt == "" and zero_on_empty: cpu_text_encoder = text_encoder_info.model @@ -180,7 +176,7 @@ def _lora_loader() -> Iterator[Tuple[ModelPatchRaw, float]]: with ( # apply all patches while the model is on the target device text_encoder_info.model_on_device() as (cached_weights, text_encoder), - tokenizer_info as tokenizer, + context.models.load(clip_field.tokenizer) as tokenizer, LayerPatcher.apply_smart_model_patches( model=text_encoder, patches=_lora_loader(), @@ -226,7 +222,6 @@ def _lora_loader() -> Iterator[Tuple[ModelPatchRaw, float]]: del tokenizer del text_encoder - del tokenizer_info del text_encoder_info c = c.detach().to("cpu") diff --git a/invokeai/app/invocations/denoise_latents.py b/invokeai/app/invocations/denoise_latents.py index 5aeeff57ad5..1e37e54e8ae 100644 --- a/invokeai/app/invocations/denoise_latents.py +++ b/invokeai/app/invocations/denoise_latents.py @@ -547,7 +547,6 @@ def prep_ip_adapter_image_prompts( for single_ip_adapter in ip_adapters: with context.models.load(single_ip_adapter.ip_adapter_model) as ip_adapter_model: assert isinstance(ip_adapter_model, IPAdapter) - image_encoder_model_info = context.models.load(single_ip_adapter.image_encoder_model) # `single_ip_adapter.image` could be a list or a single ImageField. Normalize to a list here. 
single_ipa_image_fields = single_ip_adapter.image if not isinstance(single_ipa_image_fields, list): @@ -556,7 +555,7 @@ def prep_ip_adapter_image_prompts( single_ipa_images = [ context.images.get_pil(image.image_name, mode="RGB") for image in single_ipa_image_fields ] - with image_encoder_model_info as image_encoder_model: + with context.models.load(single_ip_adapter.image_encoder_model) as image_encoder_model: assert isinstance(image_encoder_model, CLIPVisionModelWithProjection) # Get image embeddings from CLIP and ImageProjModel. image_prompt_embeds, uncond_image_prompt_embeds = ip_adapter_model.get_image_embeds( @@ -621,7 +620,6 @@ def run_t2i_adapters( t2i_adapter_data = [] for t2i_adapter_field in t2i_adapter: t2i_adapter_model_config = context.models.get_config(t2i_adapter_field.t2i_adapter_model.key) - t2i_adapter_loaded_model = context.models.load(t2i_adapter_field.t2i_adapter_model) image = context.images.get_pil(t2i_adapter_field.image.image_name, mode="RGB") # The max_unet_downscale is the maximum amount that the UNet model downscales the latent image internally. @@ -637,7 +635,7 @@ def run_t2i_adapters( raise ValueError(f"Unexpected T2I-Adapter base model type: '{t2i_adapter_model_config.base}'.") t2i_adapter_model: T2IAdapter - with t2i_adapter_loaded_model as t2i_adapter_model: + with context.models.load(t2i_adapter_field.t2i_adapter_model) as t2i_adapter_model: total_downscale_factor = t2i_adapter_model.total_downscale_factor # Note: We have hard-coded `do_classifier_free_guidance=False`. This is because we only want to prepare @@ -926,10 +924,8 @@ def step_callback(state: PipelineIntermediateState) -> None: # ext: t2i/ip adapter ext_manager.run_callback(ExtensionCallbackType.SETUP, denoise_ctx) - unet_info = context.models.load(self.unet.unet) - assert isinstance(unet_info.model, UNet2DConditionModel) with ( - unet_info.model_on_device() as (cached_weights, unet), + context.models.load(self.unet.unet).model_on_device() as (cached_weights, unet), ModelPatcher.patch_unet_attention_processor(unet, denoise_ctx.inputs.attention_processor_cls), # ext: controlnet ext_manager.patch_extensions(denoise_ctx), @@ -995,11 +991,9 @@ def _lora_loader() -> Iterator[Tuple[ModelPatchRaw, float]]: del lora_info return - unet_info = context.models.load(self.unet.unet) - assert isinstance(unet_info.model, UNet2DConditionModel) with ( ExitStack() as exit_stack, - unet_info.model_on_device() as (cached_weights, unet), + context.models.load(self.unet.unet).model_on_device() as (cached_weights, unet), ModelPatcher.apply_freeu(unet, self.unet.freeu_config), SeamlessExt.static_patch_model(unet, self.unet.seamless_axes), # FIXME # Apply the LoRA after unet has been moved to its target device for faster patching. diff --git a/invokeai/app/invocations/flux_denoise.py b/invokeai/app/invocations/flux_denoise.py index d8bc8135bc7..d61b0c77c1c 100644 --- a/invokeai/app/invocations/flux_denoise.py +++ b/invokeai/app/invocations/flux_denoise.py @@ -199,8 +199,8 @@ def _run_diffusion( else None ) - transformer_info = context.models.load(self.transformer.transformer) - is_schnell = "schnell" in getattr(transformer_info.config, "config_path", "") + transformer_config = context.models.get_config(self.transformer.transformer) + is_schnell = "schnell" in getattr(transformer_config, "config_path", "") # Calculate the timestep schedule. timesteps = get_schedule( @@ -299,9 +299,11 @@ def _run_diffusion( ) # Load the transformer model. 
- (cached_weights, transformer) = exit_stack.enter_context(transformer_info.model_on_device()) + (cached_weights, transformer) = exit_stack.enter_context( + context.models.load(self.transformer.transformer).model_on_device() + ) assert isinstance(transformer, Flux) - config = transformer_info.config + config = transformer_config assert config is not None # Determine if the model is quantized. @@ -512,15 +514,18 @@ def _prep_controlnet_extensions( # before loading the models. Then make sure that all VAE encoding is done before loading the ControlNets to # minimize peak memory. - # First, load the ControlNet models so that we can determine the ControlNet types. - controlnet_models = [context.models.load(controlnet.control_model) for controlnet in controlnets] - # Calculate the controlnet conditioning tensors. # We do this before loading the ControlNet models because it may require running the VAE, and we are trying to # keep peak memory down. controlnet_conds: list[torch.Tensor] = [] - for controlnet, controlnet_model in zip(controlnets, controlnet_models, strict=True): + for controlnet in controlnets: image = context.images.get_pil(controlnet.image.image_name) + + # HACK(ryand): We have to load the ControlNet model to determine whether the VAE needs to be run. We really + # shouldn't have to load the model here. There's a risk that the model will be dropped from the model cache + # before we load it into VRAM and thus we'll have to load it again (context: + # https://github.com/invoke-ai/InvokeAI/issues/7513). + controlnet_model = context.models.load(controlnet.control_model) if isinstance(controlnet_model.model, InstantXControlNetFlux): if self.controlnet_vae is None: raise ValueError("A ControlNet VAE is required when using an InstantX FLUX ControlNet.") @@ -550,10 +555,8 @@ def _prep_controlnet_extensions( # Finally, load the ControlNet models and initialize the ControlNet extensions. 
controlnet_extensions: list[XLabsControlNetExtension | InstantXControlNetExtension] = [] - for controlnet, controlnet_cond, controlnet_model in zip( - controlnets, controlnet_conds, controlnet_models, strict=True - ): - model = exit_stack.enter_context(controlnet_model) + for controlnet, controlnet_cond in zip(controlnets, controlnet_conds, strict=True): + model = exit_stack.enter_context(context.models.load(controlnet.control_model)) if isinstance(model, XLabsControlNetFlux): controlnet_extensions.append( diff --git a/invokeai/app/invocations/flux_text_encoder.py b/invokeai/app/invocations/flux_text_encoder.py index 3f1f38c4a1f..3c49b6287b1 100644 --- a/invokeai/app/invocations/flux_text_encoder.py +++ b/invokeai/app/invocations/flux_text_encoder.py @@ -69,14 +69,11 @@ def invoke(self, context: InvocationContext) -> FluxConditioningOutput: ) def _t5_encode(self, context: InvocationContext) -> torch.Tensor: - t5_tokenizer_info = context.models.load(self.t5_encoder.tokenizer) - t5_text_encoder_info = context.models.load(self.t5_encoder.text_encoder) - prompt = [self.prompt] with ( - t5_text_encoder_info as t5_text_encoder, - t5_tokenizer_info as t5_tokenizer, + context.models.load(self.t5_encoder.text_encoder) as t5_text_encoder, + context.models.load(self.t5_encoder.tokenizer) as t5_tokenizer, ): assert isinstance(t5_text_encoder, T5EncoderModel) assert isinstance(t5_tokenizer, T5Tokenizer) @@ -90,22 +87,20 @@ def _t5_encode(self, context: InvocationContext) -> torch.Tensor: return prompt_embeds def _clip_encode(self, context: InvocationContext) -> torch.Tensor: - clip_tokenizer_info = context.models.load(self.clip.tokenizer) - clip_text_encoder_info = context.models.load(self.clip.text_encoder) - prompt = [self.prompt] + clip_text_encoder_info = context.models.load(self.clip.text_encoder) + clip_text_encoder_config = clip_text_encoder_info.config + assert clip_text_encoder_config is not None + with ( clip_text_encoder_info.model_on_device() as (cached_weights, clip_text_encoder), - clip_tokenizer_info as clip_tokenizer, + context.models.load(self.clip.tokenizer) as clip_tokenizer, ExitStack() as exit_stack, ): assert isinstance(clip_text_encoder, CLIPTextModel) assert isinstance(clip_tokenizer, CLIPTokenizer) - clip_text_encoder_config = clip_text_encoder_info.config - assert clip_text_encoder_config is not None - # Apply LoRA models to the CLIP encoder. # Note: We apply the LoRA after the transformer has been moved to its target device for faster patching. 
if clip_text_encoder_config.format in [ModelFormat.Diffusers]: diff --git a/invokeai/app/invocations/flux_vae_decode.py b/invokeai/app/invocations/flux_vae_decode.py index 362ce78de9d..902cdecfcd0 100644 --- a/invokeai/app/invocations/flux_vae_decode.py +++ b/invokeai/app/invocations/flux_vae_decode.py @@ -3,6 +3,7 @@ from PIL import Image from invokeai.app.invocations.baseinvocation import BaseInvocation, invocation +from invokeai.app.invocations.constants import LATENT_SCALE_FACTOR from invokeai.app.invocations.fields import ( FieldDescriptions, Input, @@ -24,7 +25,7 @@ title="FLUX Latents to Image", tags=["latents", "image", "vae", "l2i", "flux"], category="latents", - version="1.0.0", + version="1.0.1", ) class FluxVaeDecodeInvocation(BaseInvocation, WithMetadata, WithBoard): """Generates an image from latents.""" @@ -38,8 +39,23 @@ class FluxVaeDecodeInvocation(BaseInvocation, WithMetadata, WithBoard): input=Input.Connection, ) + def _estimate_working_memory(self, latents: torch.Tensor, vae: AutoEncoder) -> int: + """Estimate the working memory required by the invocation in bytes.""" + # It was found experimentally that the peak working memory scales linearly with the number of pixels and the + # element size (precision). + out_h = LATENT_SCALE_FACTOR * latents.shape[-2] + out_w = LATENT_SCALE_FACTOR * latents.shape[-1] + element_size = next(vae.parameters()).element_size() + scaling_constant = 1090 # Determined experimentally. + working_memory = out_h * out_w * element_size * scaling_constant + + # We add a 20% buffer to the working memory estimate to be safe. + working_memory = working_memory * 1.2 + return int(working_memory) + def _vae_decode(self, vae_info: LoadedModel, latents: torch.Tensor) -> Image.Image: - with vae_info as vae: + estimated_working_memory = self._estimate_working_memory(latents, vae_info.model) + with vae_info.model_on_device(working_mem_bytes=estimated_working_memory) as (_, vae): assert isinstance(vae, AutoEncoder) vae_dtype = next(iter(vae.parameters())).dtype latents = latents.to(device=TorchDevice.choose_torch_device(), dtype=vae_dtype) diff --git a/invokeai/app/invocations/latents_to_image.py b/invokeai/app/invocations/latents_to_image.py index 45e06a3f2ad..4942ca5da5c 100644 --- a/invokeai/app/invocations/latents_to_image.py +++ b/invokeai/app/invocations/latents_to_image.py @@ -34,7 +34,7 @@ title="Latents to Image", tags=["latents", "image", "vae", "l2i"], category="latents", - version="1.3.0", + version="1.3.1", ) class LatentsToImageInvocation(BaseInvocation, WithMetadata, WithBoard): """Generates an image from latents.""" @@ -53,13 +53,55 @@ class LatentsToImageInvocation(BaseInvocation, WithMetadata, WithBoard): tile_size: int = InputField(default=0, multiple_of=8, description=FieldDescriptions.vae_tile_size) fp32: bool = InputField(default=False, description=FieldDescriptions.fp32) + def _estimate_working_memory( + self, latents: torch.Tensor, use_tiling: bool, vae: AutoencoderKL | AutoencoderTiny + ) -> int: + """Estimate the working memory required by the invocation in bytes.""" + # It was found experimentally that the peak working memory scales linearly with the number of pixels and the + # element size (precision). This estimate is accurate for both SD1 and SDXL. + element_size = 4 if self.fp32 else 2 + scaling_constant = 960 # Determined experimentally. 
+ + if use_tiling: + tile_size = self.tile_size + if tile_size == 0: + tile_size = vae.tile_sample_min_size + assert isinstance(tile_size, int) + out_h = tile_size + out_w = tile_size + working_memory = out_h * out_w * element_size * scaling_constant + + # We add 25% to the working memory estimate when tiling is enabled to account for factors like tile overlap + # and number of tiles. We could make this more precise in the future, but this should be good enough for + # most use cases. + working_memory = working_memory * 1.25 + else: + out_h = LATENT_SCALE_FACTOR * latents.shape[-2] + out_w = LATENT_SCALE_FACTOR * latents.shape[-1] + working_memory = out_h * out_w * element_size * scaling_constant + + if self.fp32: + # If we are running in FP32, then we should account for the likely increase in model size (~250MB). + working_memory += 250 * 2**20 + + # We add 20% to the working memory estimate to be safe. + working_memory = int(working_memory * 1.2) + return working_memory + @torch.no_grad() def invoke(self, context: InvocationContext) -> ImageOutput: latents = context.tensors.load(self.latents.latents_name) + use_tiling = self.tiled or context.config.get().force_tiled_decode + vae_info = context.models.load(self.vae.vae) assert isinstance(vae_info.model, (AutoencoderKL, AutoencoderTiny)) - with SeamlessExt.static_patch_model(vae_info.model, self.vae.seamless_axes), vae_info as vae: + + estimated_working_memory = self._estimate_working_memory(latents, use_tiling, vae_info.model) + with ( + SeamlessExt.static_patch_model(vae_info.model, self.vae.seamless_axes), + vae_info.model_on_device(working_mem_bytes=estimated_working_memory) as (_, vae), + ): context.util.signal_progress("Running VAE decoder") assert isinstance(vae, (AutoencoderKL, AutoencoderTiny)) latents = latents.to(TorchDevice.choose_torch_device()) @@ -88,7 +130,7 @@ def invoke(self, context: InvocationContext) -> ImageOutput: vae.to(dtype=torch.float16) latents = latents.half() - if self.tiled or context.config.get().force_tiled_decode: + if use_tiling: vae.enable_tiling() else: vae.disable_tiling() diff --git a/invokeai/app/invocations/sd3_latents_to_image.py b/invokeai/app/invocations/sd3_latents_to_image.py index 55cbddcc51e..945ab452112 100644 --- a/invokeai/app/invocations/sd3_latents_to_image.py +++ b/invokeai/app/invocations/sd3_latents_to_image.py @@ -6,6 +6,7 @@ from PIL import Image from invokeai.app.invocations.baseinvocation import BaseInvocation, invocation +from invokeai.app.invocations.constants import LATENT_SCALE_FACTOR from invokeai.app.invocations.fields import ( FieldDescriptions, Input, @@ -26,7 +27,7 @@ title="SD3 Latents to Image", tags=["latents", "image", "vae", "l2i", "sd3"], category="latents", - version="1.3.0", + version="1.3.1", ) class SD3LatentsToImageInvocation(BaseInvocation, WithMetadata, WithBoard): """Generates an image from latents.""" @@ -40,13 +41,31 @@ class SD3LatentsToImageInvocation(BaseInvocation, WithMetadata, WithBoard): input=Input.Connection, ) + def _estimate_working_memory(self, latents: torch.Tensor, vae: AutoencoderKL) -> int: + """Estimate the working memory required by the invocation in bytes.""" + # It was found experimentally that the peak working memory scales linearly with the number of pixels and the + # element size (precision). + out_h = LATENT_SCALE_FACTOR * latents.shape[-2] + out_w = LATENT_SCALE_FACTOR * latents.shape[-1] + element_size = next(vae.parameters()).element_size() + scaling_constant = 1230 # Determined experimentally. 
+ working_memory = out_h * out_w * element_size * scaling_constant + + # We add a 20% buffer to the working memory estimate to be safe. + working_memory = working_memory * 1.2 + return int(working_memory) + @torch.no_grad() def invoke(self, context: InvocationContext) -> ImageOutput: latents = context.tensors.load(self.latents.latents_name) vae_info = context.models.load(self.vae.vae) assert isinstance(vae_info.model, (AutoencoderKL)) - with SeamlessExt.static_patch_model(vae_info.model, self.vae.seamless_axes), vae_info as vae: + estimated_working_memory = self._estimate_working_memory(latents, vae_info.model) + with ( + SeamlessExt.static_patch_model(vae_info.model, self.vae.seamless_axes), + vae_info.model_on_device(working_mem_bytes=estimated_working_memory) as (_, vae), + ): context.util.signal_progress("Running VAE") assert isinstance(vae, (AutoencoderKL)) latents = latents.to(TorchDevice.choose_torch_device()) diff --git a/invokeai/app/invocations/sd3_text_encoder.py b/invokeai/app/invocations/sd3_text_encoder.py index 0f83ca32188..301ab2ee6ba 100644 --- a/invokeai/app/invocations/sd3_text_encoder.py +++ b/invokeai/app/invocations/sd3_text_encoder.py @@ -87,14 +87,11 @@ def invoke(self, context: InvocationContext) -> SD3ConditioningOutput: def _t5_encode(self, context: InvocationContext, max_seq_len: int) -> torch.Tensor: assert self.t5_encoder is not None - t5_tokenizer_info = context.models.load(self.t5_encoder.tokenizer) - t5_text_encoder_info = context.models.load(self.t5_encoder.text_encoder) - prompt = [self.prompt] with ( - t5_text_encoder_info as t5_text_encoder, - t5_tokenizer_info as t5_tokenizer, + context.models.load(self.t5_encoder.text_encoder) as t5_text_encoder, + context.models.load(self.t5_encoder.tokenizer) as t5_tokenizer, ): context.util.signal_progress("Running T5 encoder") assert isinstance(t5_text_encoder, T5EncoderModel) @@ -129,14 +126,12 @@ def _t5_encode(self, context: InvocationContext, max_seq_len: int) -> torch.Tens def _clip_encode( self, context: InvocationContext, clip_model: CLIPField, tokenizer_max_length: int = 77 ) -> Tuple[torch.Tensor, torch.Tensor]: - clip_tokenizer_info = context.models.load(clip_model.tokenizer) - clip_text_encoder_info = context.models.load(clip_model.text_encoder) - prompt = [self.prompt] + clip_text_encoder_info = context.models.load(clip_model.text_encoder) with ( clip_text_encoder_info.model_on_device() as (cached_weights, clip_text_encoder), - clip_tokenizer_info as clip_tokenizer, + context.models.load(clip_model.tokenizer) as clip_tokenizer, ExitStack() as exit_stack, ): context.util.signal_progress("Running CLIP encoder") diff --git a/invokeai/app/invocations/spandrel_image_to_image.py b/invokeai/app/invocations/spandrel_image_to_image.py index 0aa6dd33466..0f34bec7771 100644 --- a/invokeai/app/invocations/spandrel_image_to_image.py +++ b/invokeai/app/invocations/spandrel_image_to_image.py @@ -157,9 +157,6 @@ def invoke(self, context: InvocationContext) -> ImageOutput: # revisit this. image = context.images.get_pil(self.image.image_name, mode="RGB") - # Load the model. - spandrel_model_info = context.models.load(self.image_to_image_model) - def step_callback(step: int, total_steps: int) -> None: context.util.signal_progress( message=f"Processing tile {step}/{total_steps}", @@ -167,7 +164,7 @@ def step_callback(step: int, total_steps: int) -> None: ) # Do the upscaling. 
- with spandrel_model_info as spandrel_model: + with context.models.load(self.image_to_image_model) as spandrel_model: assert isinstance(spandrel_model, SpandrelImageToImageModel) # Upscale the image @@ -206,9 +203,6 @@ def invoke(self, context: InvocationContext) -> ImageOutput: # revisit this. image = context.images.get_pil(self.image.image_name, mode="RGB") - # Load the model. - spandrel_model_info = context.models.load(self.image_to_image_model) - # The target size of the image, determined by the provided scale. We'll run the upscaler until we hit this size. # Later, we may mutate this value if the model doesn't upscale the image or if the user requested a multiple of 8. target_width = int(image.width * self.scale) @@ -221,7 +215,7 @@ def step_callback(iteration: int, step: int, total_steps: int) -> None: ) # Do the upscaling. - with spandrel_model_info as spandrel_model: + with context.models.load(self.image_to_image_model) as spandrel_model: assert isinstance(spandrel_model, SpandrelImageToImageModel) iteration = 1 diff --git a/invokeai/app/invocations/tiled_multi_diffusion_denoise_latents.py b/invokeai/app/invocations/tiled_multi_diffusion_denoise_latents.py index 7c1442177f0..03ad4740a06 100644 --- a/invokeai/app/invocations/tiled_multi_diffusion_denoise_latents.py +++ b/invokeai/app/invocations/tiled_multi_diffusion_denoise_latents.py @@ -201,12 +201,9 @@ def _lora_loader() -> Iterator[Tuple[ModelPatchRaw, float]]: yield (lora_info.model, lora.weight) del lora_info - # Load the UNet model. - unet_info = context.models.load(self.unet.unet) - with ( ExitStack() as exit_stack, - unet_info as unet, + context.models.load(self.unet.unet) as unet, LayerPatcher.apply_smart_model_patches( model=unet, patches=_lora_loader(), prefix="lora_unet_", dtype=unet.dtype ), diff --git a/invokeai/app/services/config/config_default.py b/invokeai/app/services/config/config_default.py index 52653de0f4c..40d8628a379 100644 --- a/invokeai/app/services/config/config_default.py +++ b/invokeai/app/services/config/config_default.py @@ -13,7 +13,6 @@ from pathlib import Path from typing import Any, Literal, Optional -import psutil import yaml from pydantic import BaseModel, Field, PrivateAttr, field_validator from pydantic_settings import BaseSettings, PydanticBaseSettingsSource, SettingsConfigDict @@ -25,8 +24,6 @@ INIT_FILE = Path("invokeai.yaml") DB_FILE = Path("invokeai.db") LEGACY_INIT_FILE = Path("invokeai.init") -DEFAULT_RAM_CACHE = 10.0 -DEFAULT_VRAM_CACHE = 0.25 DEVICE = Literal["auto", "cpu", "cuda", "cuda:1", "mps"] PRECISION = Literal["auto", "float16", "bfloat16", "float32"] ATTENTION_TYPE = Literal["auto", "normal", "xformers", "sliced", "torch-sdp"] @@ -36,24 +33,6 @@ CONFIG_SCHEMA_VERSION = "4.0.2" -def get_default_ram_cache_size() -> float: - """Run a heuristic for the default RAM cache based on installed RAM.""" - - # On some machines, psutil.virtual_memory().total gives a value that is slightly less than the actual RAM, so the - # limits are set slightly lower than than what we expect the actual RAM to be. 
- - GB = 1024**3 - max_ram = psutil.virtual_memory().total / GB - - if max_ram >= 60: - return 15.0 - if max_ram >= 30: - return 7.5 - if max_ram >= 14: - return 4.0 - return 2.1 # 2.1 is just large enough for sd 1.5 ;-) - - class URLRegexTokenPair(BaseModel): url_regex: str = Field(description="Regular expression to match against the URL") token: str = Field(description="Token to use when the URL matches the regex") @@ -103,11 +82,12 @@ class InvokeAIAppConfig(BaseSettings): profile_graphs: Enable graph profiling using `cProfile`. profile_prefix: An optional prefix for profile output files. profiles_dir: Path to profiles output directory. - ram: Maximum memory amount used by memory model cache for rapid switching (GB). - vram: Amount of VRAM reserved for model storage (GB). - lazy_offload: Keep models in VRAM until their space is needed. + ram: The maximum amount of CPU RAM to use for model caching in GB. If unset, the limit will be configured based on the available RAM. In most cases, it is recommended to leave this unset. + vram: The amount of VRAM to use for model caching in GB. If unset, the limit will be configured based on the available VRAM and the device_working_mem_gb. In most cases, it is recommended to leave this unset. + lazy_offload: DEPRECATED: This setting is no longer used. Lazy-offloading is enabled by default. This config setting will be removed once the new model cache behaviour is out of beta. log_memory_usage: If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to only enable this feature if you are actively inspecting the model cache's behaviour. - enable_partial_loading: Enable partial loading of models. This enables models to run with reduced VRAM requirements (at the cost of slower speed) by streaming the model from RAM to VRAM as its used. Partial loading can cause models to run more slowly if they were previously being fully loaded into VRAM. If enabling this setting, make sure that your ram and vram cache limits are properly tuned. + device_working_mem_gb: The amount of working memory to keep available on the compute device (in GB). Has no effect if running on CPU. If you are experiencing OOM errors, try increasing this value. + enable_partial_loading: Enable partial loading of models. This enables models to run with reduced VRAM requirements (at the cost of slower speed) by streaming the model from RAM to VRAM as its used. In some edge cases, partial loading can cause models to run more slowly if they were previously being fully loaded into VRAM. device: Preferred execution device. `auto` will choose the device depending on the hardware platform and the installed torch capabilities.
Valid values: `auto`, `cpu`, `cuda`, `cuda:1`, `mps` precision: Floating point precision. `float16` will consume half the memory of `float32` but produce slightly lower-quality images. The `auto` setting will guess the proper precision based on your video card and operating system.
Valid values: `auto`, `float16`, `bfloat16`, `float32` sequential_guidance: Whether to calculate guidance in serial instead of in parallel, lowering memory requirements. @@ -175,11 +155,12 @@ class InvokeAIAppConfig(BaseSettings): profiles_dir: Path = Field(default=Path("profiles"), description="Path to profiles output directory.") # CACHE - ram: float = Field(default_factory=get_default_ram_cache_size, gt=0, description="Maximum memory amount used by memory model cache for rapid switching (GB).") - vram: float = Field(default=DEFAULT_VRAM_CACHE, ge=0, description="Amount of VRAM reserved for model storage (GB).") - lazy_offload: bool = Field(default=True, description="Keep models in VRAM until their space is needed.") + ram: Optional[float] = Field(default=None, gt=0, description="The maximum amount of CPU RAM to use for model caching in GB. If unset, the limit will be configured based on the available RAM. In most cases, it is recommended to leave this unset.") + vram: Optional[float] = Field(default=None, ge=0, description="The amount of VRAM to use for model caching in GB. If unset, the limit will be configured based on the available VRAM and the device_working_mem_gb. In most cases, it is recommended to leave this unset.") + lazy_offload: bool = Field(default=True, description="DEPRECATED: This setting is no longer used. Lazy-offloading is enabled by default. This config setting will be removed once the new model cache behaviour is out of beta.") log_memory_usage: bool = Field(default=False, description="If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to only enable this feature if you are actively inspecting the model cache's behaviour.") - enable_partial_loading: bool = Field(default=False, description="Enable partial loading of models. This enables models to run with reduced VRAM requirements (at the cost of slower speed) by streaming the model from RAM to VRAM as its used. Partial loading can cause models to run more slowly if they were previously being fully loaded into VRAM. If enabling this setting, make sure that your ram and vram cache limits are properly tuned.") + device_working_mem_gb: float = Field(default=3, description="The amount of working memory to keep available on the compute device (in GB). Has no effect if running on CPU. If you are experiencing OOM errors, try increasing this value.") + enable_partial_loading: bool = Field(default=False, description="Enable partial loading of models. This enables models to run with reduced VRAM requirements (at the cost of slower speed) by streaming the model from RAM to VRAM as its used. In some edge cases, partial loading can cause models to run more slowly if they were previously being fully loaded into VRAM.") # DEVICE device: DEVICE = Field(default="auto", description="Preferred execution device. 
`auto` will choose the device depending on the hardware platform and the installed torch capabilities.") diff --git a/invokeai/app/services/model_manager/model_manager_default.py b/invokeai/app/services/model_manager/model_manager_default.py index c7bcd43d7a7..dc6c5811224 100644 --- a/invokeai/app/services/model_manager/model_manager_default.py +++ b/invokeai/app/services/model_manager/model_manager_default.py @@ -82,11 +82,12 @@ def build_model_manager( logger.setLevel(app_config.log_level.upper()) ram_cache = ModelCache( + execution_device_working_mem_gb=app_config.device_working_mem_gb, + enable_partial_loading=app_config.enable_partial_loading, max_ram_cache_size_gb=app_config.ram, max_vram_cache_size_gb=app_config.vram, - enable_partial_loading=app_config.enable_partial_loading, - logger=logger, execution_device=execution_device or TorchDevice.choose_torch_device(), + logger=logger, ) loader = ModelLoadService( app_config=app_config, diff --git a/invokeai/backend/model_manager/load/load_base.py b/invokeai/backend/model_manager/load/load_base.py index 1bf24edeed9..94206c8f91a 100644 --- a/invokeai/backend/model_manager/load/load_base.py +++ b/invokeai/backend/model_manager/load/load_base.py @@ -57,16 +57,22 @@ def __init__(self, cache_record: CacheRecord, cache: ModelCache): self._cache = cache def __enter__(self) -> AnyModel: - self._cache.lock(self._cache_record) + self._cache.lock(self._cache_record, None) return self.model def __exit__(self, *args: Any, **kwargs: Any) -> None: self._cache.unlock(self._cache_record) @contextmanager - def model_on_device(self) -> Generator[Tuple[Optional[Dict[str, torch.Tensor]], AnyModel], None, None]: - """Return a tuple consisting of the model's state dict (if it exists) and the locked model on execution device.""" - self._cache.lock(self._cache_record) + def model_on_device( + self, working_mem_bytes: Optional[int] = None + ) -> Generator[Tuple[Optional[Dict[str, torch.Tensor]], AnyModel], None, None]: + """Return a tuple consisting of the model's state dict (if it exists) and the locked model on execution device. + + :param working_mem_bytes: The amount of working memory to keep available on the compute device when loading the + model. + """ + self._cache.lock(self._cache_record, working_mem_bytes) try: yield (self._cache_record.cached_model.get_cpu_state_dict(), self._cache_record.cached_model.model) finally: diff --git a/invokeai/backend/model_manager/load/model_cache/cached_model/cached_model_with_partial_load.py b/invokeai/backend/model_manager/load/model_cache/cached_model/cached_model_with_partial_load.py index cecf7fb20d9..3c069c975d9 100644 --- a/invokeai/backend/model_manager/load/model_cache/cached_model/cached_model_with_partial_load.py +++ b/invokeai/backend/model_manager/load/model_cache/cached_model/cached_model_with_partial_load.py @@ -166,13 +166,17 @@ def partial_load_to_vram(self, vram_bytes_to_load: int) -> int: return vram_bytes_loaded @torch.no_grad() - def partial_unload_from_vram(self, vram_bytes_to_free: int) -> int: + def partial_unload_from_vram(self, vram_bytes_to_free: int, keep_required_weights_in_vram: bool = False) -> int: """Unload weights from VRAM until vram_bytes_to_free bytes are freed. Or the entire model is unloaded. + :param keep_required_weights_in_vram: If True, any weights that must be kept in VRAM to run the model will be + kept in VRAM. + Returns: The number of bytes unloaded from VRAM. 
""" vram_bytes_freed = 0 + required_weights_in_vram = 0 offload_device = "cpu" cur_state_dict = self._model.state_dict() @@ -183,6 +187,10 @@ def partial_unload_from_vram(self, vram_bytes_to_free: int) -> int: if param.device.type == offload_device: continue + if keep_required_weights_in_vram and key in self._keys_in_modules_that_do_not_support_autocast: + required_weights_in_vram += self._state_dict_bytes[key] + continue + cur_state_dict[key] = self._cpu_state_dict[key] vram_bytes_freed += self._state_dict_bytes[key] diff --git a/invokeai/backend/model_manager/load/model_cache/dev_utils.py b/invokeai/backend/model_manager/load/model_cache/dev_utils.py new file mode 100644 index 00000000000..4e1bac68917 --- /dev/null +++ b/invokeai/backend/model_manager/load/model_cache/dev_utils.py @@ -0,0 +1,33 @@ +from contextlib import contextmanager + +import torch + +from invokeai.backend.util.logging import InvokeAILogger + + +@contextmanager +def log_operation_vram_usage(operation_name: str): + """A helper function for tuning working memory requirements for memory-intensive ops. + + Sample usage: + + ```python + with log_operation_vram_usage("some_operation"): + some_operation() + ``` + """ + torch.cuda.synchronize() + torch.cuda.reset_peak_memory_stats() + max_allocated_before = torch.cuda.max_memory_allocated() + max_reserved_before = torch.cuda.max_memory_reserved() + try: + yield + finally: + torch.cuda.synchronize() + max_allocated_after = torch.cuda.max_memory_allocated() + max_reserved_after = torch.cuda.max_memory_reserved() + logger = InvokeAILogger.get_logger() + logger.info( + f">>>{operation_name} Peak VRAM allocated: {(max_allocated_after - max_allocated_before) / 2**20} MB, " + f"Peak VRAM reserved: {(max_reserved_after - max_reserved_before) / 2**20} MB" + ) diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache.py b/invokeai/backend/model_manager/load/model_cache/model_cache.py index 98462a54c53..bf51b974ce3 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_cache.py +++ b/invokeai/backend/model_manager/load/model_cache/model_cache.py @@ -4,6 +4,7 @@ from logging import Logger from typing import Dict, List, Optional +import psutil import torch from invokeai.backend.model_manager import AnyModel, SubModelType @@ -75,9 +76,10 @@ class ModelCache: def __init__( self, - max_ram_cache_size_gb: float, - max_vram_cache_size_gb: float, + execution_device_working_mem_gb: float, enable_partial_loading: bool, + max_ram_cache_size_gb: float | None = None, + max_vram_cache_size_gb: float | None = None, execution_device: torch.device | str = "cuda", storage_device: torch.device | str = "cpu", log_memory_usage: bool = False, @@ -85,6 +87,9 @@ def __init__( ): """Initialize the model RAM cache. + :param execution_device_working_mem_gb: The amount of working memory to keep on the GPU (in GB) i.e. non-model + VRAM. + :param enable_partial_loading: Whether to enable partial loading of models. :param max_ram_cache_size_gb: The maximum amount of CPU RAM to use for model caching in GB. This parameter is kept to maintain compatibility with previous versions of the model cache, but should be deprecated in the future. If set, this parameter overrides the default cache size logic. @@ -99,12 +104,13 @@ def __init__( behaviour. 
:param logger: InvokeAILogger to use (otherwise creates one) """ + self._enable_partial_loading = enable_partial_loading + self._execution_device_working_mem_gb = execution_device_working_mem_gb self._execution_device: torch.device = torch.device(execution_device) self._storage_device: torch.device = torch.device(storage_device) self._max_ram_cache_size_gb = max_ram_cache_size_gb self._max_vram_cache_size_gb = max_vram_cache_size_gb - self._enable_partial_loading = enable_partial_loading self._logger = PrefixedLoggerAdapter( logger or InvokeAILogger.get_logger(self.__class__.__name__), "MODEL CACHE" @@ -194,7 +200,7 @@ def get(self, key: str, stats_name: Optional[str] = None) -> CacheRecord: self._logger.debug(f"Cache hit: {key} (Type: {cache_entry.cached_model.model.__class__.__name__})") return cache_entry - def lock(self, cache_entry: CacheRecord) -> None: + def lock(self, cache_entry: CacheRecord, working_mem_bytes: Optional[int]) -> None: """Lock a model for use and move it into VRAM.""" if cache_entry.key not in self._cached_models: self._logger.info( @@ -215,7 +221,7 @@ def lock(self, cache_entry: CacheRecord) -> None: return try: - self._load_locked_model(cache_entry) + self._load_locked_model(cache_entry, working_mem_bytes) self._logger.debug( f"Finished locking model {cache_entry.key} (Type: {cache_entry.cached_model.model.__class__.__name__})" ) @@ -244,7 +250,7 @@ def unlock(self, cache_entry: CacheRecord) -> None: f"Unlocked model {cache_entry.key} (Type: {cache_entry.cached_model.model.__class__.__name__})" ) - def _load_locked_model(self, cache_entry: CacheRecord) -> None: + def _load_locked_model(self, cache_entry: CacheRecord, working_mem_bytes: Optional[int] = None) -> None: """Helper function for self.lock(). Loads a locked model into VRAM.""" start_time = time.time() @@ -254,7 +260,7 @@ def _load_locked_model(self, cache_entry: CacheRecord) -> None: model_total_bytes = cache_entry.cached_model.total_bytes() model_vram_needed = model_total_bytes - model_cur_vram_bytes - vram_available = self._get_vram_available() + vram_available = self._get_vram_available(working_mem_bytes) self._logger.debug( f"Before unloading: {self._get_vram_state_str(model_cur_vram_bytes, model_total_bytes, vram_available)}" ) @@ -263,15 +269,24 @@ def _load_locked_model(self, cache_entry: CacheRecord) -> None: # 1. If the model can fit entirely in VRAM, then make enough room for it to be loaded fully. # 2. If the model can't fit fully into VRAM, then unload all other models and load as much of the model as # possible. - vram_bytes_freed = self._offload_unlocked_models(model_vram_needed) + vram_bytes_freed = self._offload_unlocked_models(model_vram_needed, working_mem_bytes) self._logger.debug(f"Unloaded models (if necessary): vram_bytes_freed={(vram_bytes_freed/MB):.2f}MB") # Check the updated vram_available after offloading. - vram_available = self._get_vram_available() + vram_available = self._get_vram_available(working_mem_bytes) self._logger.debug( f"After unloading: {self._get_vram_state_str(model_cur_vram_bytes, model_total_bytes, vram_available)}" ) + if vram_available < 0: + # There is insufficient VRAM available. As a last resort, try to unload the model being locked from VRAM, + # as it may still be loaded from a previous use. 
+ vram_bytes_freed_from_own_model = self._move_model_to_ram(cache_entry, -vram_available) + vram_available = self._get_vram_available(working_mem_bytes) + self._logger.debug( + f"Unloaded {vram_bytes_freed_from_own_model/MB:.2f}MB from the model being locked ({cache_entry.key})." + ) + # Move as much of the model as possible into VRAM. # For testing, only allow 10% of the model to be loaded into VRAM. # vram_available = int(model_vram_needed * 0.1) @@ -280,7 +295,7 @@ def _load_locked_model(self, cache_entry: CacheRecord) -> None: model_bytes_loaded = self._move_model_to_vram(cache_entry, vram_available + MB) model_cur_vram_bytes = cache_entry.cached_model.cur_vram_bytes() - vram_available = self._get_vram_available() + vram_available = self._get_vram_available(working_mem_bytes) loaded_percent = model_cur_vram_bytes / model_total_bytes if model_total_bytes > 0 else 0 self._logger.info( f"Loaded model '{cache_entry.key}' ({cache_entry.cached_model.model.__class__.__name__}) onto " @@ -298,7 +313,7 @@ def _move_model_to_vram(self, cache_entry: CacheRecord, vram_available: int) -> if isinstance(cache_entry.cached_model, CachedModelWithPartialLoad): return cache_entry.cached_model.partial_load_to_vram(vram_available) elif isinstance(cache_entry.cached_model, CachedModelOnlyFullLoad): # type: ignore - # Partial load is not supported, so we have no choice but to try and fit it all into VRAM. + # Partial load is not supported, so we have no choice but to try and fit it all into VRAM. return cache_entry.cached_model.full_load_to_vram() else: raise ValueError(f"Unsupported cached model type: {type(cache_entry.cached_model)}") @@ -312,7 +327,9 @@ def _move_model_to_vram(self, cache_entry: CacheRecord, vram_available: int) -> def _move_model_to_ram(self, cache_entry: CacheRecord, vram_bytes_to_free: int) -> int: try: if isinstance(cache_entry.cached_model, CachedModelWithPartialLoad): - return cache_entry.cached_model.partial_unload_from_vram(vram_bytes_to_free) + return cache_entry.cached_model.partial_unload_from_vram( + vram_bytes_to_free, keep_required_weights_in_vram=cache_entry.is_locked + ) elif isinstance(cache_entry.cached_model, CachedModelOnlyFullLoad): # type: ignore return cache_entry.cached_model.full_unload_from_vram() else: @@ -322,10 +339,37 @@ def _move_model_to_ram(self, cache_entry: CacheRecord, vram_bytes_to_free: int) self._delete_cache_entry(cache_entry) raise - def _get_vram_available(self) -> int: - """Calculate the amount of additional VRAM available for the cache to use.""" - vram_total_available_to_cache = int(self._max_vram_cache_size_gb * GB) - return vram_total_available_to_cache - self._get_vram_in_use() + def _get_vram_available(self, working_mem_bytes: Optional[int]) -> int: + """Calculate the amount of additional VRAM available for the cache to use (takes into account the working + memory). + """ + # If self._max_vram_cache_size_gb is set, then it overrides the default logic. + if self._max_vram_cache_size_gb is not None: + vram_total_available_to_cache = int(self._max_vram_cache_size_gb * GB) + return vram_total_available_to_cache - self._get_vram_in_use() + + working_mem_bytes_default = int(self._execution_device_working_mem_gb * GB) + working_mem_bytes = max(working_mem_bytes or working_mem_bytes_default, working_mem_bytes_default) + + if self._execution_device.type == "cuda": + # TODO(ryand): It is debatable whether we should use memory_reserved() or memory_allocated() here.
+ # memory_reserved() includes memory reserved by the torch CUDA memory allocator that may or may not be + # re-used for future allocations. For now, we use memory_allocated() to be conservative. + # vram_reserved = torch.cuda.memory_reserved(self._execution_device) + vram_allocated = torch.cuda.memory_allocated(self._execution_device) + vram_free, _vram_total = torch.cuda.mem_get_info(self._execution_device) + vram_available_to_process = vram_free + vram_allocated + elif self._execution_device.type == "mps": + vram_reserved = torch.mps.driver_allocated_memory() + # TODO(ryand): Is it accurate that MPS shares memory with the CPU? + vram_free = psutil.virtual_memory().available + vram_available_to_process = vram_free + vram_reserved + else: + raise ValueError(f"Unsupported execution device: {self._execution_device.type}") + + vram_total_available_to_cache = vram_available_to_process - working_mem_bytes + vram_cur_available_to_cache = vram_total_available_to_cache - self._get_vram_in_use() + return vram_cur_available_to_cache def _get_vram_in_use(self) -> int: """Get the amount of VRAM currently in use by the cache.""" @@ -340,9 +384,34 @@ def _get_vram_in_use(self) -> int: def _get_ram_available(self) -> int: """Get the amount of RAM available for the cache to use, while keeping memory pressure under control.""" - - ram_total_available_to_cache = int(self._max_ram_cache_size_gb * GB) - return ram_total_available_to_cache - self._get_ram_in_use() + # If self._max_ram_cache_size_gb is set, then it overrides the default logic. + if self._max_ram_cache_size_gb is not None: + ram_total_available_to_cache = int(self._max_ram_cache_size_gb * GB) + return ram_total_available_to_cache - self._get_ram_in_use() + + virtual_memory = psutil.virtual_memory() + ram_total = virtual_memory.total + ram_available = virtual_memory.available + ram_used = ram_total - ram_available + + # The total size of all the models in the cache will often be larger than the amount of RAM reported by psutil + # (due to lazy-loading and OS RAM caching behaviour). We could just rely on the psutil values, but it feels + # like a bad idea to over-fill the model cache. So, for now, we'll try to keep the total size of models in the + # cache under the total amount of system RAM. + cache_ram_used = self._get_ram_in_use() + ram_used = max(cache_ram_used, ram_used) + + # Aim to keep 10% of RAM free. + ram_available_based_on_memory_usage = int(ram_total * 0.9) - ram_used + + # If we are running out of RAM, then there's an increased likelihood that we will run into this issue: + # https://github.com/invoke-ai/InvokeAI/issues/7513 + # To keep things running smoothly, there's a minimum RAM cache size that we always allow (even if this means + # using swap). + min_ram_cache_size_bytes = 4 * GB + ram_available_based_on_min_cache_size = min_ram_cache_size_bytes - cache_ram_used + + return max(ram_available_based_on_memory_usage, ram_available_based_on_min_cache_size) def _get_ram_in_use(self) -> int: """Get the amount of RAM currently in use.""" @@ -363,7 +432,7 @@ def _get_vram_state_str(self, model_cur_vram_bytes: int, model_total_bytes: int, + f"vram_available={(vram_available/MB):.0f} MB, " ) - def _offload_unlocked_models(self, vram_bytes_required: int) -> int: + def _offload_unlocked_models(self, vram_bytes_required: int, working_mem_bytes: Optional[int] = None) -> int: """Offload models from the execution_device until vram_bytes_required bytes are available, or all models are offloaded. Of course, locked models are not offloaded. 
@@ -378,11 +447,13 @@ def _offload_unlocked_models(self, vram_bytes_required: int) -> int: cache_entries_increasing_size = sorted(self._cached_models.values(), key=lambda x: x.cached_model.total_bytes()) for cache_entry in cache_entries_increasing_size: # We do not fully trust the count of bytes freed, so we check again on each iteration. - vram_available = self._get_vram_available() + vram_available = self._get_vram_available(working_mem_bytes) vram_bytes_to_free = vram_bytes_required - vram_available if vram_bytes_to_free <= 0: break if cache_entry.is_locked: + # TODO(ryand): In the future, we may want to partially unload locked models, but this requires careful + # handling of model patches (e.g. LoRA). continue cache_entry_bytes_freed = self._move_model_to_ram(cache_entry, vram_bytes_to_free) if cache_entry_bytes_freed > 0: @@ -420,7 +491,7 @@ def _log_cache_state(self, title: str = "Model cache state:", include_entry_deta if self._execution_device.type != "cpu": vram_in_use_bytes = self._get_vram_in_use() - vram_available_bytes = self._get_vram_available() + vram_available_bytes = self._get_vram_available(None) vram_size_bytes = vram_in_use_bytes + vram_available_bytes vram_in_use_bytes_percent = vram_in_use_bytes / vram_size_bytes if vram_size_bytes > 0 else 0 vram_available_bytes_percent = vram_available_bytes / vram_size_bytes if vram_size_bytes > 0 else 0 diff --git a/invokeai/frontend/web/src/services/api/schema.ts b/invokeai/frontend/web/src/services/api/schema.ts index 0ab9c4d13e7..98a022a28a5 100644 --- a/invokeai/frontend/web/src/services/api/schema.ts +++ b/invokeai/frontend/web/src/services/api/schema.ts @@ -300,30 +300,6 @@ export type paths = { patch?: never; trace?: never; }; - "/api/v2/models/model_cache": { - parameters: { - query?: never; - header?: never; - path?: never; - cookie?: never; - }; - /** - * Get maximum size of model manager RAM or VRAM cache. - * @description Return the current RAM or VRAM cache size setting (in GB). - */ - get: operations["get_cache_size"]; - /** - * Set maximum size of model manager RAM or VRAM cache, optionally writing new value out to invokeai.yaml config file. - * @description Set the current RAM or VRAM cache size setting (in GB). . - */ - put: operations["set_cache_size"]; - post?: never; - delete?: never; - options?: never; - head?: never; - patch?: never; - trace?: never; - }; "/api/v2/models/stats": { parameters: { query?: never; @@ -3126,12 +3102,6 @@ export type components = { [key: string]: number; }; }; - /** - * CacheType - * @description Cache type - one of vram or ram. - * @enum {string} - */ - CacheType: "RAM" | "VRAM"; /** * Calculate Image Tiles Even Split * @description Calculate the coordinates and overlaps of tiles that cover a target image shape. 
@@ -19766,74 +19736,6 @@ export interface operations { }; }; }; - get_cache_size: { - parameters: { - query?: { - /** @description The cache type */ - cache_type?: components["schemas"]["CacheType"]; - }; - header?: never; - path?: never; - cookie?: never; - }; - requestBody?: never; - responses: { - /** @description Successful Response */ - 200: { - headers: { - [name: string]: unknown; - }; - content: { - "application/json": number; - }; - }; - /** @description Validation Error */ - 422: { - headers: { - [name: string]: unknown; - }; - content: { - "application/json": components["schemas"]["HTTPValidationError"]; - }; - }; - }; - }; - set_cache_size: { - parameters: { - query: { - /** @description The new value for the maximum cache size */ - value: number; - /** @description The cache type */ - cache_type?: components["schemas"]["CacheType"]; - /** @description Write new value out to invokeai.yaml */ - persist?: boolean; - }; - header?: never; - path?: never; - cookie?: never; - }; - requestBody?: never; - responses: { - /** @description Successful Response */ - 200: { - headers: { - [name: string]: unknown; - }; - content: { - "application/json": number; - }; - }; - /** @description Validation Error */ - 422: { - headers: { - [name: string]: unknown; - }; - content: { - "application/json": components["schemas"]["HTTPValidationError"]; - }; - }; - }; - }; get_stats: { parameters: { query?: never; diff --git a/tests/backend/model_manager/load/model_cache/cached_model/test_cached_model_with_partial_load.py b/tests/backend/model_manager/load/model_cache/cached_model/test_cached_model_with_partial_load.py index 4fae046cf88..a3a1537c3dd 100644 --- a/tests/backend/model_manager/load/model_cache/cached_model/test_cached_model_with_partial_load.py +++ b/tests/backend/model_manager/load/model_cache/cached_model/test_cached_model_with_partial_load.py @@ -98,6 +98,37 @@ def test_cached_model_partial_unload(device: str, model: DummyModule): assert model.linear2.is_device_autocasting_enabled() +@parameterize_mps_and_cuda +def test_cached_model_partial_unload_keep_required_weights_in_vram(device: str, model: DummyModule): + # Model starts in CPU memory. + cached_model = CachedModelWithPartialLoad(model=model, compute_device=torch.device(device)) + model_total_bytes = cached_model.total_bytes() + assert cached_model.cur_vram_bytes() == 0 + + # Full load the model into VRAM. + cached_model.full_load_to_vram() + assert cached_model.cur_vram_bytes() == model_total_bytes + + # Partially unload the model from VRAM, but request the required weights to be kept in VRAM. + bytes_to_free = int(model_total_bytes) + freed_bytes = cached_model.partial_unload_from_vram(bytes_to_free, keep_required_weights_in_vram=True) + + # Check that the model is partially unloaded from VRAM. + assert freed_bytes < model_total_bytes + assert freed_bytes == model_total_bytes - cached_model.cur_vram_bytes() + assert freed_bytes == sum( + calc_tensor_size(p) for p in itertools.chain(model.parameters(), model.buffers()) if p.device.type == "cpu" + ) + # The parameters should be offloaded to the CPU, because they are in Linear layers. + assert all(p.device.type == "cpu" for p in model.parameters()) + # The buffer should still be on the device, because it is in a layer that does not support autocast. + assert all(p.device.type == device for p in model.buffers()) + + # Check that the model's modules still have device autocasting enabled. 
+ assert model.linear1.is_device_autocasting_enabled() + assert model.linear2.is_device_autocasting_enabled() + + @parameterize_mps_and_cuda def test_cached_model_full_load_and_unload(device: str, model: DummyModule): cached_model = CachedModelWithPartialLoad(model=model, compute_device=torch.device(device)) diff --git a/tests/backend/model_manager/model_manager_fixtures.py b/tests/backend/model_manager/model_manager_fixtures.py index 4449bbaf62f..0b723f211c3 100644 --- a/tests/backend/model_manager/model_manager_fixtures.py +++ b/tests/backend/model_manager/model_manager_fixtures.py @@ -26,6 +26,7 @@ VAEDiffusersConfig, ) from invokeai.backend.model_manager.load.model_cache.model_cache import ModelCache +from invokeai.backend.util.devices import TorchDevice from invokeai.backend.util.logging import InvokeAILogger from tests.backend.model_manager.model_metadata.metadata_examples import ( HFTestLoraMetadata, @@ -91,10 +92,12 @@ def mm2_download_queue(mm2_session: Session) -> DownloadQueueServiceBase: @pytest.fixture def mm2_loader(mm2_app_config: InvokeAIAppConfig) -> ModelLoadServiceBase: ram_cache = ModelCache( - logger=InvokeAILogger.get_logger(), + execution_device_working_mem_gb=mm2_app_config.device_working_mem_gb, + enable_partial_loading=mm2_app_config.enable_partial_loading, max_ram_cache_size_gb=mm2_app_config.ram, max_vram_cache_size_gb=mm2_app_config.vram, - enable_partial_loading=mm2_app_config.enable_partial_loading, + execution_device=TorchDevice.choose_torch_device(), + logger=InvokeAILogger.get_logger(), ) return ModelLoadService( app_config=mm2_app_config,