Added management of loaded models to prevent overloading the GPU and keep things as fast as possible.

crowetic 2025-05-20 17:13:59 -07:00
parent 11c9f07957
commit 3278581bf7


@@ -206,6 +206,7 @@ if [[ "$1" != "--update" ]]; then
 docker run -d -p 3000:8080 --gpus all \
 -e OPENAI_API_BASE_URL=http://pipelines:9099 \
 -e OPENAI_API_KEY=0p3n-w3bu! \
+-e OLLAMA_MAX_LOADED_MODELS=1 \
 -v ollama:/root/.ollama \
 -v open-webui:/app/backend/data \
 --name open-webui \
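
A quick way to check that the cap is in effect once the container is up: a minimal sketch, assuming this is the bundled open-webui image that ships the Ollama CLI (the container name comes from the run command above).

# List the models Ollama currently has resident; with
# OLLAMA_MAX_LOADED_MODELS=1 this should show at most one entry.
docker exec open-webui ollama ps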
@@ -476,6 +477,7 @@ if [[ "$1" != "--update" ]]; then
 -e ENABLE_BACKENDS=llama-cuda,ollama \
 -e INCLUDE_DEFAULT_MODELS=true \
 -e AUTOLOAD_MODELS=true \
+-e MODEL_IDLE_TIMEOUT=600 \
 --restart unless-stopped \
 localai/localai:latest-aio-gpu-nvidia-cuda-12
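
To confirm the idle unload frees VRAM, one option is to issue a single request and watch GPU memory after the timeout elapses. A sketch under stated assumptions: that this LocalAI image honors MODEL_IDLE_TIMEOUT, that the container publishes its API on localhost:8080, and that "gpt-4" is the default model alias shipped by the AIO image.

# Load a model with one request, then check VRAM after the 600 s idle window.
curl -s http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model":"gpt-4","messages":[{"role":"user","content":"hello"}]}'
sleep 610
nvidia-smi --query-gpu=memory.used --format=csv   # should be back near idle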