#!/bin/bash
# Start a llama-server instance serving the nomic embedding model, unless one
# is already listening on EMBEDDING_PORT. Output is appended to LOG_EMBEDDINGS.
set -x   # command tracing (was '-x' in the shebang, which is lost when run as 'bash script.sh')

# --- Configuration -----------------------------------------------------------
readonly EMBEDDING_MODEL="/mnt/nvme0n1/LLM/nomic-ai/quantized/nomic-embed-text-v1.5-Q8_0.gguf"
readonly LLAMA_SERVER="/usr/local/bin/llama-server"
readonly LOG_EMBEDDINGS="$HOME/tmp/llm-embeddings.log"
readonly EMBEDDING_PORT=9999
# Minimum free VRAM (MB) required before offloading all layers to the GPU.
# (Previously the check used 1000 while the comment/message claimed 600 —
# a single constant now keeps the test and the message consistent.)
readonly MIN_GPU_MEMORY_MB=1000

# Address/interface the server binds to.
HOST=$(/home/data1/protected/bin/rcd/rcd-get-ethernet-interface.sh) || {
    echo "Failed to determine host interface" >&2
    exit 1
}

# If something already listens on the embedding port, there is nothing to do.
if ss -tuln | grep -q ":$EMBEDDING_PORT "; then
    echo "Embedding model is already running on port $EMBEDDING_PORT. Skipping start."
    exit 0
fi

# Kill any stale embedding llama-server. NOTE: '.*' between the binary name and
# '--port' is required — the process is launched with other flags before --port
# (see below), so the old literal pattern "llama-server --port N" never matched.
pkill -f -- "llama-server.*--port $EMBEDDING_PORT" || true

# Offload all layers to the GPU (-ngl 999) only when enough VRAM is free.
# Validate the nvidia-smi output is numeric before comparing, so a missing or
# failing nvidia-smi degrades to CPU mode instead of a script error.
FREE_GPU_MEMORY=$(nvidia-smi --query-gpu=memory.free --format=csv,noheader | head -n 1 | awk '{print $1}')
NGL_ARGS=()
if [[ "$FREE_GPU_MEMORY" =~ ^[0-9]+$ ]] && (( FREE_GPU_MEMORY >= MIN_GPU_MEMORY_MB )); then
    NGL_ARGS=(-ngl 999)
else
    echo "Free GPU memory is less than $MIN_GPU_MEMORY_MB MB. -ngl 999 flag will not be used."
fi

# Ensure the log directory exists before redirecting into it.
mkdir -p -- "$(dirname "$LOG_EMBEDDINGS")"

# Launch the server in the background; stdout+stderr go to the log file.
echo "Starting Embedding Model: $(basename "$EMBEDDING_MODEL")"
"$LLAMA_SERVER" "${NGL_ARGS[@]}" -v -c 8192 -ub 1024 --embedding --log-timestamps \
    --host "$HOST" --port "$EMBEDDING_PORT" -m "$EMBEDDING_MODEL" \
    >> "$LOG_EMBEDDINGS" 2>&1 &
echo "Embedding model started. Logs can be found at $LOG_EMBEDDINGS"