#!/bin/bash

# Quiet logging defaults for the server/cluster side where Ray is started.
#   RAY_LOGGING_LEVEL  - only warnings and above
#   RAY_LOGGING_FORMAT - bare message, which simplifies the format
#   RAY_LOG_TO_DRIVER  - 0 is the key setting: it is meant to stop logs from
#                        being forwarded to clients
export \
    RAY_LOGGING_LEVEL="WARNING" \
    RAY_LOGGING_FORMAT='{message}' \
    RAY_LOG_TO_DRIVER="0"

# NOTE: this is expected to conflict at merge time with a parallel change
# (Drew's); the two versions should be functionally equivalent.
#
# Absolute path of the directory that holds this script: take the directory
# part of the script path first, then canonicalise it with cd + pwd.
SCRIPT_DIR=$(dirname "${BASH_SOURCE[0]}")
SCRIPT_DIR=$(cd "$SCRIPT_DIR" && pwd)

# RAY_LOG_TO_STDERR=1 / RAY_LOG_TO_DRIVER are deliberately not forced on
# unconditionally, because Ray's own output is very noisy. The application is
# expected to test KAMIWAZA_DEBUG_RAY at init so that only *our* log output
# keeps flowing.
#
# KAMIWAZA_DEBUG=true only turns Ray debugging on when the marker file
# "kamiwaza-shibboleth" exists in the current working directory.
if [[ "$KAMIWAZA_DEBUG" == "true" && -f "kamiwaza-shibboleth" ]]; then
    export KAMIWAZA_DEBUG_RAY=true
fi

# With KAMIWAZA_DEBUG_RAY=true (set above or inherited from the caller),
# override the quiet defaults with verbose, non-deduplicated logging.
case "${KAMIWAZA_DEBUG_RAY:-false}" in
    true)
        echo "ray: log to driver enabled"
        export \
            RAY_LOGGING_LEVEL="DEBUG" \
            RAY_DEBUG=1 \
            RAY_LOG_TO_DRIVER=1 \
            RAY_DEDUP_LOGS=0 \
            RAY_LOG_TO_DRIVER_EVENT_LEVEL="DEBUG"
        ;;
esac
# Load the shared helper scripts. Both are referenced by bare name, so they
# are located through the shell's normal lookup for sourced files.
. set-kamiwaza-root.sh
# (detect-gpu.sh is not sourced here; presumably its helpers now come from
# common.sh - TODO confirm)
. common.sh

# Try to raise the soft open-file limit; failure is only a warning.
if ! ensure_nofile_min 65536; then
    echo "warn: could not set soft nofile to 65536" >&2
fi

# Allow higher memory utilization on DGX Spark/Grace Hopper when running core Ray.
is_dgx_spark && export RAY_memory_usage_threshold=0.99

# Decide once, up front, whether this is macOS so later sections can branch
# on a single flag. $OSTYPE is consulted first; uname is only run when
# $OSTYPE does not already say "darwin".
IS_MACOS=false
case "$OSTYPE" in
    darwin*)
        IS_MACOS=true
        ;;
    *)
        if [ "$(uname -s)" = "Darwin" ]; then
            IS_MACOS=true
        fi
        ;;
esac

# Function to check if a directory meets the criteria and can be written to
#
# A candidate is accepted when:
#   1. it (or, for a path ending in "/tmp", its parent) is an existing
#      directory that is a mountpoint,
#   2. df reports at least the requested number of whole GiB available, and
#   3. a scratch sub-directory can be created inside it.
#
# Arguments: $1 - candidate directory
#            $2 - minimum free space, in whole GiB
# Outputs:   the candidate path on stdout when it is accepted
# Returns:   0 when accepted, 1 otherwise
check_directory() {
    local dir=$1
    local min_size=$2
    local mount_dir=$dir
    local int_re='^[0-9]+$'

    # For /tmp subdirectories, check if parent is a mountpoint.
    # Stripping "/tmp" from the top-level "/tmp" leaves an empty string; its
    # parent is "/", so map the empty result back to "/" instead of rejecting.
    if [[ "$dir" == */tmp ]]; then
        mount_dir=${dir%/tmp}
        mount_dir=${mount_dir:-/}
    fi

    # stderr is silenced so a host without the mountpoint utility simply
    # rejects every candidate instead of printing "command not found" each time.
    if [ ! -d "$mount_dir" ] || ! mountpoint -q "$mount_dir" 2>/dev/null; then
        return 1
    fi

    # Declared separately from the assignment so the pipeline result is not
    # masked by `local`. -P keeps each filesystem on a single output line.
    local free_space
    free_space=$(df -P -BG "$dir" 2>/dev/null | awk 'NR==2 { sub(/G$/, "", $4); print $4 }')

    # df may fail (missing directory, unsupported flags); never feed a
    # non-integer into the numeric comparison below.
    if ! [[ "$free_space" =~ $int_re && "$min_size" =~ $int_re ]]; then
        return 1
    fi

    if [ "$free_space" -ge "$min_size" ]; then
        # Check if we can create a directory
        local test_dir="$dir/ray_temp_test"
        if mkdir -p "$test_dir" 2>/dev/null; then
            rm -rf -- "$test_dir"
            echo "$dir"
            return 0
        fi
    fi
    return 1
}

# Print this host's IPv4 addresses, one per line.
# Tries ifconfig first, then `ip`, then `hostname -I`, using the first tool
# that yields any output. Returns 1 when no address could be obtained.
_kamiwaza_local_ipv4s() {
    local addrs=""

    if command -v ifconfig > /dev/null 2>&1; then
        # Accept both "inet 10.0.0.5" and the older "inet addr:10.0.0.5" layout.
        addrs=$(ifconfig 2>/dev/null | awk '$1 == "inet" { sub(/^addr:/, "", $2); print $2 }')
    fi
    if [ -z "$addrs" ] && command -v ip > /dev/null 2>&1; then
        # One-line-per-address output; field 4 is "ADDRESS/PREFIX".
        addrs=$(ip -o -4 addr show 2>/dev/null | awk '{ sub(/\/.*/, "", $4); print $4 }')
    fi
    if [ -z "$addrs" ] && command -v hostname > /dev/null 2>&1; then
        addrs=$(hostname -I 2>/dev/null | tr -s '[:space:]' '\n')
    fi

    [ -n "$addrs" ] || return 1
    printf '%s\n' "$addrs"
}

# Return 0 when address $1 is configured on this host, non-zero otherwise.
# Being unable to list local addresses at all is reported on stderr; the
# caller then treats the node as a worker, as an unmatched address always has.
_kamiwaza_head_ip_is_local() {
    local addrs
    if ! addrs=$(_kamiwaza_local_ipv4s); then
        echo "warn: could not list local IPv4 addresses (tried ifconfig, ip, hostname -I); treating this node as a worker" >&2
        return 1
    fi
    printf '%s\n' "$addrs" | grep -Fx -- "$1" > /dev/null 2>&1
}

# Check config file for worker status
if [ -f "/etc/kamiwaza/config/is_worker" ]; then
    # Keep only alphanumerics and lower-case them, so "1", "true", "TRUE\n" all work.
    IS_WORKER=$(tr -cd '[:alnum:]' < /etc/kamiwaza/config/is_worker | tr '[:upper:]' '[:lower:]')
    if [ "$IS_WORKER" = "1" ] || [ "$IS_WORKER" = "true" ]; then
        export KAMIWAZA_WORKER=1
    fi
elif [ -n "${KAMIWAZA_HEAD_IP}" ]; then
    # No config file: this node is a worker unless the head IP is one of its
    # own addresses. IS_WORKER holds the lookup status (0 = head IP is local).
    _kamiwaza_head_ip_is_local "${KAMIWAZA_HEAD_IP}"
    IS_WORKER=$?
    if [ "$IS_WORKER" -ne 0 ]; then
        export KAMIWAZA_WORKER=1
    fi
fi

# Worker-only setup. Nothing here runs on a head node.
if [[ -n "$KAMIWAZA_WORKER" ]]; then
    # Read the head IP from the config file when present; everything except
    # alphanumerics, dots and colons is stripped from its content.
    [[ -f "/etc/kamiwaza/config/head_ip" ]] &&
        FILE_HEAD_IP=$(tr -cd '[:alnum:].:' < /etc/kamiwaza/config/head_ip | tr -d '[:space:]')

    # A KAMIWAZA_HEAD_IP supplied by the caller takes precedence over the file.
    if [[ -z "$KAMIWAZA_HEAD_IP" && -n "$FILE_HEAD_IP" ]]; then
        KAMIWAZA_HEAD_IP="$FILE_HEAD_IP"
    fi

    # Auto-set KAMIWAZA_KEYCLOAK_HOST from KAMIWAZA_HEAD_IP if not already set
    if [[ -z "${KAMIWAZA_KEYCLOAK_HOST:-}" && -n "$KAMIWAZA_HEAD_IP" ]]; then
        export KAMIWAZA_KEYCLOAK_HOST="$KAMIWAZA_HEAD_IP"
    fi

    # A worker cannot start without knowing where the head is.
    if [[ -z "$KAMIWAZA_HEAD_IP" ]]; then
        echo "KAMIWAZA_HEAD_IP is not set, but KAMIWAZA_WORKER is set -  you MUST pass it or place it in /etc/kamiwaza/config/head_ip"
        exit 1
    fi

    # Workers are not supported on macOS/Darwin.
    if [[ "$IS_MACOS" == "true" ]]; then
        echo "ERROR: Ray workers are not supported on macOS/Darwin systems"
        exit 1
    fi

    # Leave a marker file in the install root recording the worker role.
    [[ -z "$KAMIWAZA_ROOT" ]] || touch "$KAMIWAZA_ROOT/ray-is-worker"
fi

# Function to set RAY_TEMP_DIR
#
# Walks a fixed, ordered list of candidate locations and takes the first one
# that check_directory accepts with at least 60 GiB free. On success it:
#   - creates "<candidate>/ray_temp" and exports it as RAY_TEMP_DIR,
#   - rewrites the RAY_TEMP_DIR export in env.sh (install root first, then
#     /etc/kamiwaza),
#   - writes ~/.ray/ray.yaml with the same temp_dir.
# When nothing qualifies, or the directory cannot be created, RAY_TEMP_DIR is
# left unset and Ray's default temporary directory is used.
#
# Globals:  KAMIWAZA_ROOT (read), RAY_TEMP_DIR (exported or unset)
# Outputs:  progress messages on stdout
# Returns:  1 when a directory was chosen but no env.sh exists to record it;
#           0 otherwise
set_ray_temp_dir() {
    local directories=(
        "/mnt/tmp"
        "/scratch1" "/scratch2" "/scratch3" "/scratch4" "/scratch5" "/scratch6" "/scratch7" "/scratch8" "/scratch9"
        "/mnt"
        "/opt/kamiwaza"
        "/tmp"
    )
    local selected_dir=""
    # Kept local so the loop variable and the env file path do not leak into
    # (or clobber) the caller's globals.
    local dir env_file

    for dir in "${directories[@]}"; do
        if selected_dir=$(check_directory "$dir" 60); then
            echo "Selected $dir"
            break
        fi
    done

    if [ -z "$selected_dir" ]; then
        echo "No suitable directory found for RAY_TEMP_DIR. Using default Ray temporary directory."
        return 0
    fi

    # If a directory was selected, set up Ray temp directory
    export RAY_TEMP_DIR="$selected_dir/ray_temp"
    if ! mkdir -p "$RAY_TEMP_DIR"; then
        echo "Failed to create $RAY_TEMP_DIR. Using default Ray temporary directory."
        unset RAY_TEMP_DIR
        return 0
    fi

    # Determine env file location. The install-root copy is only considered
    # when KAMIWAZA_ROOT is non-empty, so an unset root never resolves to "/env.sh".
    if [ -n "${KAMIWAZA_ROOT:-}" ] && [ -f "${KAMIWAZA_ROOT}/env.sh" ]; then
        env_file="${KAMIWAZA_ROOT}/env.sh"
    elif [ -f "/etc/kamiwaza/env.sh" ]; then
        env_file="/etc/kamiwaza/env.sh"
    else
        echo "No env.sh file found in ${KAMIWAZA_ROOT} or /etc/kamiwaza"
        return 1
    fi

    # Remove any existing RAY_TEMP_DIR exports from env file, then append the
    # current one so the file carries exactly one definition.
    sed -i '/^export RAY_TEMP_DIR=/d' "$env_file"
    echo "export RAY_TEMP_DIR=$RAY_TEMP_DIR" >> "$env_file"

    # Create or update ray.yaml
    mkdir -p ~/.ray
    echo "temp_dir: $RAY_TEMP_DIR" > ~/.ray/ray.yaml

    echo "Set RAY_TEMP_DIR to $RAY_TEMP_DIR"
}

# Call the function to set RAY_TEMP_DIR.
# Its return status is intentionally not checked here: when it returns 1
# (directory chosen but no env.sh found to record it) the script carries on,
# and RAY_TEMP_DIR stays exported for the `ray start` invocation below.
set_ray_temp_dir

# Create dummy smi shims if the tools are not found, to prevent Ray from
# crashing with PermissionError on some systems (e.g. RHEL/SELinux enabled).
#
# The whole step is still gated on nvidia-smi being absent. Within it, a shim
# is written only for a tool that is genuinely missing, so a real rocm-smi or
# hl-smi is never shadowed by an always-failing stub. PATH is extended only
# when every needed shim was written successfully; otherwise a directory this
# script could not populate (for example one pre-created by another user)
# would end up at the front of PATH.
if ! command -v nvidia-smi &> /dev/null; then
    # Create shim directory
    SHIM_DIR="${RAY_TEMP_DIR:-/tmp}/kamiwaza_shims"
    _shim_created=""
    _shim_failed=""

    if mkdir -p "$SHIM_DIR"; then
        for _shim_tool in nvidia-smi rocm-smi hl-smi; do
            _shim_found=$(command -v "$_shim_tool" 2>/dev/null)
            if [ -n "$_shim_found" ] && [ "$_shim_found" != "$SHIM_DIR/$_shim_tool" ]; then
                # A real tool exists: drop any stale shim left by an earlier run.
                rm -f -- "$SHIM_DIR/$_shim_tool"
                continue
            fi
            # Each shim is a two-line /bin/sh script that just exits 1.
            if printf '%s\n' '#!/bin/sh' 'exit 1' > "$SHIM_DIR/$_shim_tool" \
                && chmod +x "$SHIM_DIR/$_shim_tool"; then
                _shim_created="${_shim_created:+$_shim_created, }$_shim_tool"
            else
                _shim_failed="${_shim_failed:+$_shim_failed, }$_shim_tool"
            fi
        done
    else
        _shim_failed="$SHIM_DIR"
    fi

    if [ -n "$_shim_failed" ]; then
        echo "warn: could not create smi shims ($_shim_failed) in $SHIM_DIR; leaving PATH unchanged" >&2
    elif [ -n "$_shim_created" ]; then
        # Prepend to PATH
        export PATH="$SHIM_DIR:$PATH"
        echo "Created $_shim_created shims in $SHIM_DIR to bypass detection issues"
    fi
    unset _shim_tool _shim_found _shim_created _shim_failed
fi

# Get the directory of the script, with symlinks resolved when the local
# readlink supports -f. If it does not (or fails for any other reason) the
# result would otherwise collapse to ".", so fall back to resolving the
# script's directory with cd + pwd to keep script_dir absolute.
script_dir=$(readlink -f "$0" 2>/dev/null)
if [ -n "$script_dir" ]; then
    script_dir=$(dirname "$script_dir")
else
    script_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
fi

# Add script directory to PYTHONPATH if dev file layout (package-in-tree)
# Detect by presence of package path 'kamiwaza/launch.py' without a top-level 'launch.py'
if [[ -f "$script_dir/kamiwaza/launch.py" ]] && [[ ! -f "$script_dir/launch.py" ]]; then
    # Append only when not already listed, so repeated runs do not grow PYTHONPATH.
    if [[ ":${PYTHONPATH:-}:" != *":${script_dir}:"* ]]; then
        export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}${script_dir}"
    fi
fi

# Determine the number of CPUs
num_cpus=""
if [ "$IS_MACOS" = true ]; then
    # On macOS, limit to performance cores if possible. The key does not
    # exist on every Mac, so its error output is silenced and hw.ncpu is the
    # fallback.
    num_cpus=$(sysctl -n hw.perflevel0.logicalcpu_max 2>/dev/null)
    if [ -z "$num_cpus" ]; then
        num_cpus=$(sysctl -n hw.ncpu 2>/dev/null)
    fi
else
    num_cpus=$(grep -c ^processor /proc/cpuinfo 2>/dev/null)
fi

# The probes above can come back empty or zero (unreadable /proc/cpuinfo, a
# cpuinfo layout without "processor" lines). Never hand that to
# `ray start --num-cpus`: ask getconf instead, and as a last resort use 1.
case "$num_cpus" in
    '' | 0 | *[!0-9]*)
        num_cpus=$(getconf _NPROCESSORS_ONLN 2>/dev/null)
        ;;
esac
case "$num_cpus" in
    '' | 0 | *[!0-9]*)
        echo "warn: could not determine CPU count; defaulting to 1" >&2
        num_cpus=1
        ;;
esac

# Determine the number of GPUs
if [ "$IS_MACOS" = true ]; then
    # macOS: Set artificially high GPU count for Metal acceleration
    # This allows Kamiwaza serving to treat macOS as having "unlimited" GPU compute
    num_gpus=999
elif [ "$(detect_gpu_compute)" = "true" ]; then
    # A vendor tool is trusted only when it exits successfully AND lists
    # something. Being on PATH is not enough: this script itself may have put
    # always-failing placeholder nvidia-smi/hl-smi shims at the front of PATH,
    # and a real tool with a broken driver can fail too. In either case fall
    # through to the next detection method.
    if _gpu_listing=$(nvidia-smi --list-gpus 2>/dev/null) && [ -n "$_gpu_listing" ]; then
        num_gpus=$(( $(printf '%s\n' "$_gpu_listing" | wc -l) ))
    elif _gpu_listing=$(hl-smi -d MEMORY -Q memory.total -f csv,noheader 2>/dev/null) && [ -n "$_gpu_listing" ]; then
        # Habana Gaudi: one CSV row per device
        num_gpus=$(( $(printf '%s\n' "$_gpu_listing" | wc -l) ))
    elif command -v clinfo &> /dev/null; then
        num_gpus=$(clinfo | grep -c "Device Type.*GPU")
    else
        num_gpus=1  # At least one GPU is available since detect_gpu_compute returned true
    fi
    unset _gpu_listing
else
    num_gpus=0
fi

# Explicit KAMIWAZA_NUM_CPUS / KAMIWAZA_NUM_GPUS values, when non-empty,
# replace whatever was detected above.
[ -z "$KAMIWAZA_NUM_CPUS" ] || num_cpus=$KAMIWAZA_NUM_CPUS
[ -z "$KAMIWAZA_NUM_GPUS" ] || num_gpus=$KAMIWAZA_NUM_GPUS

# kamiwaza_gpus is the GPU count scaled by 100.
kamiwaza_gpus=$((100 * num_gpus))

# Ray port: KAMIWAZA_RAY_PORT when set, otherwise 6379.
ray_port="${KAMIWAZA_RAY_PORT:-6379}"

# Base invocation, kept as an array so every argument stays a single word.
# The uv project is the install root, or the script directory when
# KAMIWAZA_ROOT is not set.
uv_run=(
    uv --project "${KAMIWAZA_ROOT:-$script_dir}" run --frozen --
    ray start --num-cpus "$num_cpus" --disable-usage-stats
)

# A node joins an existing cluster only when it is flagged as a worker AND a
# head IP is known; in every other case it starts as the head.
if [ -z "$KAMIWAZA_HEAD_IP" ] || [ -z "$KAMIWAZA_WORKER" ]; then
    # Head node configuration
    uv_run+=(--head --port "$ray_port" --dashboard-host "0.0.0.0")
else
    # Worker node configuration
    uv_run+=(--address "${KAMIWAZA_HEAD_IP}:${ray_port}")
fi

# GPU flags are added only for a positive GPU count: the raw count for Ray
# plus the scaled "kamiwaza_gpus" custom resource as a JSON object.
if [ "$num_gpus" -gt 0 ]; then
    uv_run+=(
        --num-gpus "$num_gpus"
        --resources '{"kamiwaza_gpus": '"$kamiwaza_gpus"'}'
    )
fi

# Pass the temp directory through when one was selected.
[ -z "$RAY_TEMP_DIR" ] || uv_run+=(--temp-dir "$RAY_TEMP_DIR")

# Limit object store memory on DGX Spark/UMA systems to avoid memory pressure.
# An explicit RAY_OBJECT_STORE_MEMORY_OVERRIDE always wins; otherwise DGX Spark
# gets a 2 GiB cap (2147483648 bytes) and other hosts keep Ray's own default.
if [ -n "$RAY_OBJECT_STORE_MEMORY_OVERRIDE" ]; then
    uv_run+=(--object-store-memory "$RAY_OBJECT_STORE_MEMORY_OVERRIDE")
elif is_dgx_spark; then
    uv_run+=(--object-store-memory "2147483648")
fi

# Show the assembled command line, run it, and make the script's exit status
# the status of `ray start`. A failure is additionally reported on stdout.
echo "Starting Ray with command: ${uv_run[*]}"
if "${uv_run[@]}"; then
    ray_exit_code=0
else
    ray_exit_code=$?
    echo "Ray failed to start with exit code $ray_exit_code"
fi

exit $ray_exit_code
