From 3278581bf744419f30138a1d0c203cf38466aca9 Mon Sep 17 00:00:00 2001
From: crowetic
Date: Tue, 20 May 2025 17:13:59 -0700
Subject: [PATCH] Add loaded-model management to prevent overloading the GPU
 and keep inference as fast as possible.

---
 setup-ai-stack.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/setup-ai-stack.sh b/setup-ai-stack.sh
index 1026ae0..8a2b04d 100644
--- a/setup-ai-stack.sh
+++ b/setup-ai-stack.sh
@@ -206,6 +206,7 @@ if [[ "$1" != "--update" ]]; then
   docker run -d -p 3000:8080 --gpus all \
     -e OPENAI_API_BASE_URL=http://pipelines:9099 \
     -e OPENAI_API_KEY=0p3n-w3bu! \
+    -e OLLAMA_MAX_LOADED_MODELS=1 \
     -v ollama:/root/.ollama \
     -v open-webui:/app/backend/data \
     --name open-webui \
@@ -476,6 +477,7 @@ if [[ "$1" != "--update" ]]; then
     -e ENABLE_BACKENDS=llama-cuda,ollama \
     -e INCLUDE_DEFAULT_MODELS=true \
     -e AUTOLOAD_MODELS=true \
+    -e MODEL_IDLE_TIMEOUT=600 \
     --restart unless-stopped \
     localai/localai:latest-aio-gpu-nvidia-cuda-12
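
A minimal sketch for verifying the two new limits once the containers restart. It assumes the Open WebUI image bundles the Ollama runtime (suggested by the ollama:/root/.ollama volume mount), that the LocalAI container is named "localai" (its --name flag, if any, is outside this hunk's context, so that name is hypothetical), and that LocalAI honors MODEL_IDLE_TIMEOUT as an idle-unload timeout, which is what this patch intends:

  # Confirm the env vars reached the running containers.
  docker inspect open-webui --format '{{range .Config.Env}}{{println .}}{{end}}' \
    | grep OLLAMA_MAX_LOADED_MODELS   # expect: OLLAMA_MAX_LOADED_MODELS=1
  docker inspect localai --format '{{range .Config.Env}}{{println .}}{{end}}' \
    | grep MODEL_IDLE_TIMEOUT         # expect: MODEL_IDLE_TIMEOUT=600

  # If the image bundles the ollama CLI, list the models currently resident;
  # with OLLAMA_MAX_LOADED_MODELS=1 at most one should appear.
  docker exec open-webui ollama ps

The intent of the two settings: OLLAMA_MAX_LOADED_MODELS=1 caps Ollama at one resident model, so a newly requested model evicts the previous one instead of competing for VRAM, while MODEL_IDLE_TIMEOUT=600 is meant to let LocalAI unload a model after 600 seconds of inactivity.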