#!/bin/bash
# etcd_cluster_manager.sh - Place in KAMIWAZA_ROOT
# Strict mode: abort on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

#######################################
# Succeed when the given address is assigned to a local interface.
# Arguments: $1 - IP address to look for
# Returns:   0 if `ifconfig` lists the address, non-zero otherwise
#######################################
_etcd_is_local_ip() {
    local local_ips
    # Capture the addresses first instead of piping into `grep -q`: under
    # `pipefail` an early-exiting grep can make the whole pipeline look failed.
    # The sub() strips the "addr:" prefix printed by older net-tools versions.
    local_ips=$(ifconfig | awk '/inet/ { sub(/^addr:/, "", $2); print $2 }') || true
    # -F -x: compare whole lines literally (dots in an IP are not wildcards).
    grep -Fxq -- "$1" <<< "${local_ips}"
}

#######################################
# Run one command line on the head node over SSH with the cluster key.
# Globals:   KAMIWAZA_HEAD_IP (read)
# Arguments: $1 - command line for the remote shell
# Outputs:   whatever the remote command prints
# Returns:   exit status of ssh
#######################################
_etcd_head_ssh() {
    ssh -i /etc/kamiwaza/ssl/cluster.key -o StrictHostKeyChecking=no \
        "${KAMIWAZA_HEAD_IP}" "$1"
}

#######################################
# Run etcdctl inside the head node's etcd container.
# Globals:   KAMIWAZA_ENV, HEAD_HOSTNAME (read)
# Arguments: $1 - etcdctl sub-command, optionally followed by a pipeline that
#                 is evaluated by the remote shell on the head node
# Returns:   exit status of ssh
#######################################
_etcd_head_etcdctl() {
    local container="${KAMIWAZA_ENV:-default}_kamiwaza-etcd-${HEAD_HOSTNAME}"
    local certs="/etc/etcd/certs"
    local remote="docker exec ${container} etcdctl"
    remote+=" --cert=${certs}/peer-${HEAD_HOSTNAME}.pem"
    remote+=" --key=${certs}/peer-${HEAD_HOSTNAME}-key.pem"
    remote+=" --cacert=${certs}/ca.pem"
    remote+=" $1"
    _etcd_head_ssh "${remote}"
}

#######################################
# Remove etcd members one ID at a time.
# Arguments: $1 - whitespace/newline separated list of member IDs
# Returns:   0 if every removal succeeded, 1 on the first failure
#######################################
_etcd_remove_members() {
    local member_id
    # Word splitting is intentional: the list may hold several IDs, and
    # a newline embedded in the remote command would run the next ID as a
    # command of its own.
    for member_id in $1; do
        if ! _etcd_head_etcdctl "member remove ${member_id}"; then
            echo "Error: Failed to remove etcd member ${member_id}"
            return 1
        fi
    done
}

#######################################
# Export the two-member (this node + head node) join configuration.
# Globals:   KAMIWAZA_ENV, HEAD_HOSTNAME (read)
#            KAMIWAZA_ETCD_INITIAL_CLUSTER, KAMIWAZA_ETCD_CLUSTER_STATE (exported)
# Arguments: $1 - etcd member name of this node
#######################################
_etcd_use_head_only_config() {
    local head_node_name="${KAMIWAZA_ENV:-default}_kamiwaza-etcd-${HEAD_HOSTNAME}"
    export KAMIWAZA_ETCD_INITIAL_CLUSTER="${1}=https://${1}:2380,${head_node_name}=https://${head_node_name}:2380"
    export KAMIWAZA_ETCD_CLUSTER_STATE="existing"
}

#######################################
# Build and export the etcd configuration for this node.
#
# If KAMIWAZA_HEAD_IP is set and is not one of this machine's addresses, the
# node is treated as a worker: it clears stale/stuck "unstarted" members on
# the head node, registers itself via add_node_to_etcd_cluster.sh and derives
# the initial-cluster string. Otherwise it is configured as a new head node.
#
# Globals:
#   KAMIWAZA_HEAD_IP, KAMIWAZA_ENV, KAMIWAZA_ROOT (read)
#   KAMIWAZA_ETCD_NODE_NAME (read; defaults to `hostname`; exported)
#   HEAD_HOSTNAME (written; left global because the helpers read it)
#   KAMIWAZA_ETCD_INITIAL_CLUSTER, KAMIWAZA_ETCD_CLUSTER_STATE,
#   KAMIWAZA_ETCD_ADVERTISE_PEER_URLS, KAMIWAZA_ETCD_ADVERTISE_CLIENT_URLS
#   (exported)
# Outputs:   progress and the resulting configuration on stdout
# Returns:   0 on success; calls `exit 1` when the head node cannot be
#            reached, a member removal or the registration fails, or all join
#            attempts are used up
#######################################
build_cluster_config() {
    # Set node name and tracking variables
    export KAMIWAZA_ETCD_NODE_NAME="${KAMIWAZA_ETCD_NODE_NAME:-$(hostname)}"
    local node_name="${KAMIWAZA_ENV:-default}_kamiwaza-etcd-${KAMIWAZA_ETCD_NODE_NAME}"
    local max_join_attempts=10
    local current_attempt=1
    local last_seen_unstarted_node=""
    local retry_delay=30
    local unstarted_nodes our_unstarted initial_members_count initial_cluster

    if [[ -n "${KAMIWAZA_HEAD_IP:-}" ]] && ! _etcd_is_local_ip "${KAMIWAZA_HEAD_IP}"; then
        echo "Configuring worker node..."

        # Get head node hostname. The assignment sits inside the `if` so that
        # an SSH failure reaches the error message instead of tripping `set -e`.
        if ! HEAD_HOSTNAME=$(_etcd_head_ssh hostname) || [[ -z "${HEAD_HOSTNAME}" ]]; then
            echo "Error: Failed to resolve head node's hostname via SSH"
            exit 1
        fi

        while (( current_attempt <= max_join_attempts )); do
            echo "Join attempt $current_attempt of $max_join_attempts"

            # IDs of members that were added but never started. The hex ID is
            # taken from the simple output format (avoids JSON precision loss).
            unstarted_nodes=$(_etcd_head_etcdctl \
                "member list | grep unstarted | awk -F',' '{print \$1}'") || true

            if [[ -n "${unstarted_nodes}" ]]; then
                echo "Found unstarted nodes: $unstarted_nodes"

                # Is there an entry for our name or peer URL? Match the name
                # field or the URL host literally and completely, so that
                # e.g. "host1" does not also select the entry of "host10".
                our_unstarted=$(_etcd_head_etcdctl \
                    "member list | grep -F -e ', ${node_name},' -e '//${node_name}:' | awk -F',' '{print \$1}'") || true

                if [[ -n "${our_unstarted}" ]]; then
                    # This is our own stale entry
                    echo "Found our own stale entry (ID: $our_unstarted), removing it"
                    _etcd_remove_members "${our_unstarted}" || exit 1

                    # Sleep briefly to let the cluster stabilize
                    sleep 5
                    continue  # Go back to the start of the loop to check again
                elif [[ -n "${last_seen_unstarted_node}" && "${unstarted_nodes}" == "${last_seen_unstarted_node}" ]]; then
                    # We've seen the same unstarted node twice - it's likely stuck
                    echo "Same unstarted node seen again, attempting to remove it to avoid deadlock"
                    _etcd_remove_members "${unstarted_nodes}" || exit 1

                    # Sleep briefly to let the cluster stabilize
                    sleep 5
                    last_seen_unstarted_node=""
                    continue  # Go back and check again
                fi

                # Someone else is joining: remember it, wait, and use up one attempt.
                if (( current_attempt == 1 )); then
                    echo "Another node is trying to join, waiting for it to complete or timeout"
                    last_seen_unstarted_node="${unstarted_nodes}"
                else
                    last_seen_unstarted_node="${unstarted_nodes}"
                    echo "Different unstarted node detected, waiting for it to complete"
                fi
                sleep "${retry_delay}"
                current_attempt=$((current_attempt + 1))
                continue
            fi

            # No unstarted nodes or we've handled them - proceed with registration

            # Try to get the current cluster size BEFORE adding this node
            initial_members_count=$(_etcd_head_etcdctl \
                "member list -w json | jq '.members | length'") || initial_members_count="0"

            # Try to register this node with the cluster via the head node
            echo "Registering node with cluster..."
            if ! _etcd_head_ssh "cd ${KAMIWAZA_ROOT} && bash add_node_to_etcd_cluster.sh ${node_name}"; then
                echo "Error: Failed to register node with cluster"
                exit 1
            fi

            if [[ "${initial_members_count}" == "1" ]]; then
                # Joining a single-node (head-only) cluster: use the fixed
                # two-member configuration instead of querying again.
                echo "Joining as second node to head-only cluster, using hardcoded configuration"
                _etcd_use_head_only_config "${node_name}"
            else
                # For clusters with already 2+ members, we can safely query again
                echo "Getting updated cluster members..."
                initial_cluster=$(_etcd_head_etcdctl \
                    "member list -w json | jq -r '.members[] | (.name // \"${node_name}\") + \"=\" + .peerURLs[0]' | paste -sd ',' -") || initial_cluster=""

                if [[ -z "${initial_cluster}" ]]; then
                    echo "Error: Failed to build initial cluster configuration"

                    # Use fallback configuration when query fails
                    echo "Using fallback configuration for join"
                    _etcd_use_head_only_config "${node_name}"
                else
                    export KAMIWAZA_ETCD_CLUSTER_STATE="existing"
                    export KAMIWAZA_ETCD_INITIAL_CLUSTER="${initial_cluster}"
                fi
            fi

            # Configuration decided, leave the retry loop
            break
        done

        # Check if we've exhausted all attempts
        if (( current_attempt > max_join_attempts )); then
            echo "Error: Failed to join the cluster after $max_join_attempts attempts"
            exit 1
        fi
    else
        echo "Configuring head node..."
        export KAMIWAZA_ETCD_CLUSTER_STATE="new"
        export KAMIWAZA_ETCD_INITIAL_CLUSTER="${node_name}=https://${node_name}:2380"
    fi

    export KAMIWAZA_ETCD_ADVERTISE_PEER_URLS="https://${node_name}:2380"
    export KAMIWAZA_ETCD_ADVERTISE_CLIENT_URLS="https://${node_name}:2379"

    echo "Etcd configuration generated:"
    echo "Node Name: ${KAMIWAZA_ETCD_NODE_NAME}"
    echo "Initial Cluster: ${KAMIWAZA_ETCD_INITIAL_CLUSTER}"
    echo "Cluster State: ${KAMIWAZA_ETCD_CLUSTER_STATE}"
    echo "Advertise Peer URLs: ${KAMIWAZA_ETCD_ADVERTISE_PEER_URLS}"
    echo "Advertise Client URLs: ${KAMIWAZA_ETCD_ADVERTISE_CLIENT_URLS}"
}

# Invoke build_cluster_config only when this file is executed as a script.
# When the file is sourced instead, the function is defined but not called.
[[ "${BASH_SOURCE[0]}" != "${0}" ]] || build_cluster_config