Added management of loaded models to prevent overloading the GPU and keep things as fast as possible.

crowetic 2025-05-20 17:13:59 -07:00
parent 11c9f07957
commit 3278581bf7


@@ -206,6 +206,7 @@ if [[ "$1" != "--update" ]]; then
 docker run -d -p 3000:8080 --gpus all \
 -e OPENAI_API_BASE_URL=http://pipelines:9099 \
 -e OPENAI_API_KEY=0p3n-w3bu! \
+-e OLLAMA_MAX_LOADED_MODELS=1 \
 -v ollama:/root/.ollama \
 -v open-webui:/app/backend/data \
 --name open-webui \
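
A quick way to check that the cap is in effect once the container is up: a minimal sketch, assuming this is the bundled open-webui image that ships the Ollama CLI (the container name comes from the run command above).

# List the models Ollama currently has resident; with
# OLLAMA_MAX_LOADED_MODELS=1 this should show at most one entry.
docker exec open-webui ollama ps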
@@ -476,6 +477,7 @@ if [[ "$1" != "--update" ]]; then
 -e ENABLE_BACKENDS=llama-cuda,ollama \
 -e INCLUDE_DEFAULT_MODELS=true \
 -e AUTOLOAD_MODELS=true \
+-e MODEL_IDLE_TIMEOUT=600 \
 --restart unless-stopped \
 localai/localai:latest-aio-gpu-nvidia-cuda-12
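
To confirm the idle unload frees VRAM, one option is to issue a single request and watch GPU memory after the timeout elapses. A sketch under stated assumptions: that this LocalAI image honors MODEL_IDLE_TIMEOUT, that the container publishes its API on localhost:8080, and that "gpt-4" is the default model alias shipped by the AIO image.

# Load a model with one request, then check VRAM after the 600 s idle window.
curl -s http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model":"gpt-4","messages":[{"role":"user","content":"hello"}]}'
sleep 610
nvidia-smi --query-gpu=memory.used --format=csv   # should be back near idle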