#!/bin/bash -x
# Variables
EMBEDDING_MODEL="/mnt/nvme0n1/LLM/nomic-ai/quantized/nomic-embed-text-v1.5-Q8_0.gguf"
LLAMA_SERVER="/usr/local/bin/llama-server"
LOG_EMBEDDINGS="$HOME/tmp/llm-embeddings.log"
EMBEDDING_PORT=9999
# Get the host address to bind to (via a site-local helper script)
HOST=$(/home/data1/protected/bin/rcd/rcd-get-ethernet-interface.sh)
# Check if the embedding port is already listening
if ! ss -tuln | grep -q ":$EMBEDDING_PORT "; then
    # Kill any stale llama-server instance tied to the embedding port.
    # The server is launched with flags between "llama-server" and "--port",
    # so match with a wildcard rather than an exact prefix.
    pkill -f "llama-server.*--port $EMBEDDING_PORT"
    # Offload all layers to the GPU only if at least 1000 MiB is free.
    # nvidia-smi reports values like "8000 MiB"; awk keeps the numeric part
    # for the first GPU.
    FREE_GPU_MEMORY=$(nvidia-smi --query-gpu=memory.free --format=csv,noheader | head -n 1 | awk '{print $1}')
    if [[ $FREE_GPU_MEMORY -ge 1000 ]]; then
        NGL_FLAG="-ngl 999"
    else
        NGL_FLAG=""
        echo "Free GPU memory is less than 1000 MiB. -ngl 999 flag will not be used."
    fi
    # Start the embedding model with llama-server.
    # NGL_FLAG is intentionally unquoted so it expands to nothing when empty.
    echo "Starting Embedding Model: $(basename "$EMBEDDING_MODEL")"
    mkdir -p "$(dirname "$LOG_EMBEDDINGS")"  # make sure the log directory exists
    $LLAMA_SERVER $NGL_FLAG -v -c 8192 -ub 1024 --embedding --log-timestamps --host "$HOST" --port "$EMBEDDING_PORT" -m "$EMBEDDING_MODEL" >> "$LOG_EMBEDDINGS" 2>&1 &
    echo "Embedding model started. Logs can be found at $LOG_EMBEDDINGS"
else
echo "Embedding model is already running on port $EMBEDDING_PORT. Skipping start."
fi
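
# Example query once the server is up: a hedged sketch, assuming the
# OpenAI-compatible /v1/embeddings route that recent llama-server builds
# expose (older builds use /embedding instead). Kept commented out so the
# script's behavior is unchanged:
# curl -s "http://$HOST:$EMBEDDING_PORT/v1/embeddings" \
#     -H "Content-Type: application/json" \
#     -d '{"input": "hello world"}'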