#!/usr/bin/env python3

"""
Improved Master Service Script (kamiwazad)

This script acts as the master service for controlling other services.
It is designed to be installed and managed via systemd/systemctl.

Features:
- Start, monitor, and restart services
- Kill orphan processes
- Log all activities
- Send alerts when critical issues occur
- Provide status updates for the services
- Track PIDs of subprocesses for reliable monitoring
- Graceful shutdown and restart mechanisms
- Monitor Docker containers and Ray service

Requirements:
- The script should start the necessary services in the correct order.
- It should monitor the services and restart them if they fail.
- It should check for and kill orphan processes.
- It should log all activities to a log file.
- It should send alerts when critical failures occur.
- It should provide status updates for the services.
"""

import subprocess
import logging
import time
import os
import platform
import select
import signal
import sys
import json
import traceback
import venv 
import threading
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional, Tuple, Union, cast
import yaml
from dotenv import load_dotenv
import psutil
import argparse
from dataclasses import dataclass
from typing import Iterator, IO
from enum import Enum
import fcntl
import errno

# Kamiwaza secretary logging (telemetry and enterprise logging)
# Optional dependency: the flag stays False when either import below fails.
# NOTE(review): this probe runs before KAMIWAZA_ROOT is inserted into sys.path
# (done in the "Modern logging setup" block below) — confirm the kamiwaza
# package is importable at this point without that path entry.
secretary_logger_available = False
try:
    from kamiwaza.logger.events import EventName
    from secretary_setup import kamiwaza_logger_instance, get_logger
    secretary_logger_available = True
except ImportError:
    # `logging` is already imported at the top of the file; the re-import is harmless.
    import logging
    logging.getLogger("kamiwaza").info("Secretary logger not available, telemetry logging will be skipped")

# Modern logging setup
# Both flags below are set together: True when setup_logging imports, False otherwise.
try:
    # Add Kamiwaza root to Python path if not already there
    kamiwaza_root = os.getenv('KAMIWAZA_ROOT')
    if kamiwaza_root and kamiwaza_root not in sys.path:
        sys.path.insert(0, kamiwaza_root)
    from kamiwaza.lib.logging import setup_logging
    modern_logging_available = True
    subprocess_capture_available = True
except ImportError:
    modern_logging_available = False
    subprocess_capture_available = False

# OTEL and Metrics setup
# All three imports must succeed for either flag to become True; a failure in
# any one of them leaves both flags False.
otel_available = False
metrics_available = False
try:
    from kamiwaza.lib.otel_config import otel_config
    from kamiwaza.lib.metrics import metrics_config, metrics_helper
    from opentelemetry.trace import Status, StatusCode
    otel_available = True
    metrics_available = True
except ImportError:
    # Observability extras are optional; continue without them.
    pass


def _env_flag(env_var: str) -> Optional[bool]:
    """Interpret an environment variable as a tri-state boolean.

    Args:
        env_var: Name of the environment variable to read.

    Returns:
        True for "1"/"true"/"yes"/"on", False for "0"/"false"/"no"/"off"
        (compared case-insensitively after stripping surrounding whitespace),
        and None when the variable is unset or holds any other value.
    """
    setting = os.getenv(env_var)
    if setting is None:
        return None

    # Lookup table of every recognised spelling; anything else maps to None.
    recognised = {
        "1": True, "true": True, "yes": True, "on": True,
        "0": False, "false": False, "no": False, "off": False,
    }
    return recognised.get(setting.strip().lower())


def should_enable_syslog() -> bool:
    """Decide whether the syslog handler should be enabled.

    An explicit KAMIWAZA_DISABLE_SYSLOG setting always wins: a truthy value
    disables syslog and a falsy value enables it. Without a recognisable
    setting, syslog is enabled unless the host looks like a development
    environment, i.e. macOS or KAMIWAZA_RUNNING_IN_CONTAINER set to a truthy value.

    Returns:
        True when syslog should be enabled, False otherwise.
    """
    explicit_disable = _env_flag("KAMIWAZA_DISABLE_SYSLOG")
    if explicit_disable is not None:
        return not explicit_disable

    # Heuristic fallback: an unset or unrecognised container flag counts as "not in a container".
    in_container = _env_flag("KAMIWAZA_RUNNING_IN_CONTAINER") is True
    on_macos = platform.system() == "Darwin"
    return not (on_macos or in_container)

# Exception handler setup
# Optional dependency: the flag records whether install_exception_handlers
# could be imported. The handlers are not installed here.
exception_handler_available = False
try:
    from kamiwaza.lib.logging.exception_handler import install_exception_handlers
    exception_handler_available = True
except ImportError:
    # Leave the flag False and continue without the project exception handlers.
    pass


# Load environment variables
load_dotenv()

# Determine whether we are allowed to fall back to basic logging (primarily for tests)
# Only an explicitly truthy KAMIWAZA_ALLOW_LOGGING_FALLBACK enables the fallback;
# an unset or unrecognised value leaves it disabled.
_allow_logging_fallback = _env_flag("KAMIWAZA_ALLOW_LOGGING_FALLBACK") is True
using_fallback_logging = False

# Configure early logging - modern logging required for enterprise observability
# Stage 1: setup_logging could not be imported. Either fall back to
# logging.basicConfig (when permitted) or abort the process with exit code 1.
if not modern_logging_available:
    import sys
    if _allow_logging_fallback:
        logging.basicConfig(level=logging.INFO)  # noqa: BASIC_CONFIG - intentional fallback
        early_logger = logging.getLogger("kamiwazad")
        early_logger.warning(
            "Modern logging (setup_logging) is not available. Proceeding with basic logging "
            "because KAMIWAZA_ALLOW_LOGGING_FALLBACK is enabled."
        )
        using_fallback_logging = True
    else:
        # No logger exists yet, so report straight to stderr.
        # NOTE(review): setup_logging is imported from kamiwaza.lib.logging, but the
        # message below names kamiwaza.core.logging.config — confirm which module
        # operators should be pointed at.
        print("CRITICAL: Modern logging (setup_logging) is not available.", file=sys.stderr)
        print("Cannot continue without logging infrastructure.", file=sys.stderr)
        print("Check that kamiwaza.core.logging.config is properly installed.", file=sys.stderr)
        sys.exit(1)

# Stage 2: setup_logging is importable. Initialise it, with the same
# fallback-or-abort policy if initialisation itself raises.
if not using_fallback_logging:
    try:
        enable_syslog = should_enable_syslog()
        early_logger = setup_logging(
            service_name="kamiwazad",
            component=__name__,
            enable_syslog=enable_syslog
        )
        early_logger.debug("Modern structured logging initialized for kamiwazad")
    except Exception as e:
        import sys
        import traceback
        if _allow_logging_fallback:
            logging.basicConfig(level=logging.INFO)  # noqa: BASIC_CONFIG - error recovery fallback
            early_logger = logging.getLogger("kamiwazad")
            early_logger.warning(
                "Logging initialization failed during kamiwazad startup. "
                "Proceeding with basic logging fallback: %s",
                e,
            )
            using_fallback_logging = True
        else:
            print("CRITICAL: Logging initialization failed during kamiwazad startup.", file=sys.stderr)
            print(f"Error: {e}", file=sys.stderr)
            print("Traceback:", file=sys.stderr)
            traceback.print_exc(file=sys.stderr)
            print("\nCannot continue without logging. Check configuration and environment.", file=sys.stderr)
            sys.exit(1)

# Every path that reaches this line has bound early_logger (the others exited above).
early_logger.debug("Loading environment variables and initializing kamiwazad")

# Get KAMIWAZA_ROOT from environment variable
early_logger.debug("Setting KAMIWAZA_ROOT from environment")
# None when the variable is unset; only enforced below when run as a script.
KAMIWAZA_ROOT = os.getenv('KAMIWAZA_ROOT')

# Only validate and change directory if running as main script, not when imported
if __name__ == '__main__':
    if not KAMIWAZA_ROOT:
        raise EnvironmentError("KAMIWAZA_ROOT environment variable is not set")
    # Changes the process working directory to KAMIWAZA_ROOT.
    os.chdir(KAMIWAZA_ROOT)

# Module-level mutable state, keyed by service name.
service_states: Dict[str, str] = {}
container_startup_times: Dict[str, float] = {}  # Track when containers were last started

def is_worker_node() -> bool:
    """Report whether this host is configured as a worker node.

    Returns:
        False when KAMIWAZAD_IS_WORKER is unset or exactly '0'; True for any
        other value (including the empty string).
    """
    return os.getenv('KAMIWAZAD_IS_WORKER', '0') != '0'

def get_pids_by_name(process_identifier: Union[str, List[str]], full_cmdline: bool = False) -> List[int]:
    """
    Find PIDs of running processes whose name or command line contains an identifier.

    The current process and every process in its parent chain are never
    reported, so callers that go on to signal the returned PIDs cannot hit
    this daemon or whatever launched it.

    Args:
        process_identifier: One substring, or a list of substrings, to look for.
        full_cmdline: When True, search the space-joined command line instead
            of the process name.

    Returns:
        PIDs of matching processes, in the order psutil enumerates them.
        An empty list is returned if enumeration fails unexpectedly.
    """
    needles = [process_identifier] if isinstance(process_identifier, str) else process_identifier

    # Walk up the parent chain from our own PID, collecting each ancestor.
    own_pid = os.getpid()
    ancestors = []
    try:
        cursor = own_pid
        while cursor > 1:  # Stop once we have climbed to init (or PID 0)
            try:
                cursor = psutil.Process(cursor).ppid()
            except psutil.NoSuchProcess:
                break
            ancestors.append(cursor)
    except Exception:
        logger.warning("Could not determine parent PIDs - proceeding with caution")

    protected = {own_pid, *ancestors}

    found = []
    try:
        for candidate in psutil.process_iter(['pid', 'name', 'cmdline']):
            try:
                if candidate.pid in protected:
                    continue

                if full_cmdline:
                    try:
                        haystack = ' '.join(candidate.cmdline())
                    except (psutil.NoSuchProcess, psutil.AccessDenied):
                        continue
                else:
                    haystack = candidate.name()

                # A process is reported once, no matter how many identifiers match.
                if any(needle in haystack for needle in needles):
                    found.append(candidate.pid)

            except (psutil.NoSuchProcess, psutil.AccessDenied):
                # Process vanished or is off-limits; skip it.
                continue

        return found

    except Exception as e:
        logger.error(f"Error getting PIDs: {e}")
        return []

def expand_env_vars(value):
    """Expand environment variables in a string, dict, or list (recursively).

    Strings that start with a shell-style conditional expansion are resolved here:

    * ``${VAR:-default}`` -> value of VAR, or ``default`` when VAR is unset or empty
    * ``${VAR:+alternate}`` -> ``alternate`` when VAR is set and non-empty, else ``""``

    Any text after the closing brace is expanded and appended. All other
    strings are passed through ``os.path.expandvars`` (``$VAR`` / ``${VAR}``).

    Args:
        value: A string, a dict, a list, or any other object.

    Returns:
        The expanded string, a new dict/list with every contained value
        expanded, or ``value`` itself for any other type.
    """
    if isinstance(value, str):
        if value.startswith("${"):
            colon = value.find(":")
            close = value.find("}")
            # Only treat it as a conditional expansion when the colon sits
            # inside the braces; "${HOME}/a:b" must fall through to expandvars.
            if -1 < colon < close:
                # The modifier is exactly the two characters ":-" or ":+";
                # everything between it and the closing brace is the operand.
                modifier = value[colon:colon + 2]
                if modifier in (":-", ":+"):
                    var_name = value[2:colon]
                    operand = value[colon + 2:close]
                    env_value = os.getenv(var_name)
                    if modifier == ":-":
                        # Use default if var is unset or empty
                        resolved = env_value if env_value else operand
                    else:
                        # Use alternate value only if var is set and not empty
                        resolved = operand if env_value else ""
                    # Keep (and expand) whatever follows the closing brace.
                    return resolved + expand_env_vars(value[close + 1:])
        return os.path.expandvars(value)
    elif isinstance(value, dict):
        return {k: expand_env_vars(v) for k, v in value.items()}
    elif isinstance(value, list):
        return [expand_env_vars(item) for item in value]
    return value


# Load configuration (only when running as main script or when KAMIWAZA_ROOT is available)
# `config` stays None when KAMIWAZA_ROOT is unset; the accessor functions
# below raise RuntimeError in that case.
config: Optional[Dict[str, Any]] = None

if KAMIWAZA_ROOT:
    config_path = os.path.join(KAMIWAZA_ROOT, 'startup/kamiwaza.yml')
    try:
        with open(config_path, 'r', encoding='utf-8') as f:
            config = yaml.safe_load(f)

        # yaml.safe_load returns None for an empty document.
        if config is None:
            raise ValueError(f"Configuration file is empty or invalid: {config_path}")

        # Expand environment variables in the config
        config = expand_env_vars(config)

    except FileNotFoundError:
        raise FileNotFoundError(f"Required configuration file not found: {config_path}")
    except yaml.YAMLError as e:
        raise ValueError(f"Invalid YAML in configuration file {config_path}: {e}")
    except Exception as e:
        # Catch-all for anything else raised above. This includes the ValueError
        # for an empty file, which therefore surfaces to the importer as RuntimeError.
        raise RuntimeError(f"Failed to load configuration from {config_path}: {e}")


def get_service_config(service_name: str) -> Dict[str, Any]:
    """Look up one service's settings in the loaded configuration.

    Args:
        service_name: Key of the service under the top-level ``services`` mapping.

    Returns:
        The mapping stored for that service.

    Raises:
        RuntimeError: The module-level configuration was never loaded.
        KeyError: ``services`` is missing or not a mapping, or the entry for
            ``service_name`` is missing or not a mapping.
    """
    if config is None:
        raise RuntimeError("Runtime configuration has not been loaded")

    all_services = config.get('services')
    if isinstance(all_services, dict):
        entry = all_services.get(service_name)
        if isinstance(entry, dict):
            return cast(Dict[str, Any], entry)
        raise KeyError(f"Configuration for service '{service_name}' is unavailable")

    raise KeyError("Configuration is missing a services section")


def get_services_section() -> Dict[str, Dict[str, Any]]:
    """Return the whole ``services`` mapping from the loaded configuration.

    Raises:
        RuntimeError: The module-level configuration was never loaded.
        KeyError: The configuration has no ``services`` mapping.
    """
    if config is None:
        raise RuntimeError("Runtime configuration has not been loaded")

    section = config.get('services')
    if isinstance(section, dict):
        return cast(Dict[str, Dict[str, Any]], section)

    raise KeyError("Configuration is missing a services section")


# Configure logging - modern logging required for enterprise observability
# Modern logging was already verified available at module initialization unless we fell back earlier
# Start from the early logger so `logger` is always bound, then try to replace
# it with a logger reconfigured for the "daemon" component.
logger = early_logger
if not using_fallback_logging:
    try:
        # Use modern structured logging (will handle log files automatically)
        enable_syslog = should_enable_syslog()
        logger = setup_logging(
            service_name="kamiwazad",
            component="daemon",
            enable_syslog=enable_syslog,
            force_reconfigure=True,
        )
        logger.debug("Kamiwazad daemon logging configured with modern structured system")
    except Exception as e:
        import sys
        import traceback
        if _allow_logging_fallback:
            # Keep using the early logger rather than aborting.
            logger = early_logger
            logger.warning(
                "Daemon logging configuration failed; continuing with basic logging fallback: %s",
                e,
            )
            using_fallback_logging = True
        else:
            # Same abort policy as early logging: report on stderr and exit 1.
            print("CRITICAL: Daemon logging configuration failed.", file=sys.stderr)
            print(f"Error: {e}", file=sys.stderr)
            print("Traceback:", file=sys.stderr)
            traceback.print_exc(file=sys.stderr)
            print("\nCannot continue without logging. Check configuration and environment.", file=sys.stderr)
            sys.exit(1)
elif logger and not logger.handlers:
    # Ensure fallback logger has sane defaults when imported under test environments
    # (reached only on the fallback path, when the "kamiwazad" logger itself has no handlers attached)
    logging.basicConfig(level=logging.INFO)  # noqa: BASIC_CONFIG - test environment fallback
    logger = logging.getLogger("kamiwazad")

# Configuration parameters
# Environment name; 'default' when KAMIWAZA_ENV is unset.
KAMIWAZA_ENV = os.getenv('KAMIWAZA_ENV', 'default')
# NOTE(review): presumably the pause between monitoring passes and the wait
# before restarting a service — their uses are not visible in this section.
MONITOR_INTERVAL = 60  # in seconds
RESTART_DELAY = 5  # in seconds

def setup_process_logging(service_name: str) -> str:
    """
    Prepare the log file that captures a service's subprocess output.

    The file is named ``kamiwazad-<service_name>.log`` and lives in the
    directory returned by ``get_log_directory()``. It is created if missing
    and left untouched if it already exists.

    Args:
        service_name: Service whose output will be captured.

    Returns:
        Path to the log file.

    Raises:
        RuntimeError: The log file cannot be opened for appending.
    """
    from kamiwaza.lib.logging.utils import get_log_directory

    log_file = os.path.join(get_log_directory(), f'kamiwazad-{service_name}.log')

    try:
        # Opening in append mode creates the file when absent and never truncates it.
        with open(log_file, 'a'):
            pass
    except OSError as e:  # PermissionError is a subclass of OSError
        raise RuntimeError(f"Unable to create log file {log_file}: {e}")

    return log_file

class CommandState(Enum):
    """Lifecycle state of a command started through run_command."""
    # Assigned to non-blocking commands at launch time.
    RUNNING = "running"
    # Blocking command finished with return code 0.
    COMPLETED = "completed"
    # Blocking command finished with a non-zero return code, or launching raised.
    FAILED = "failed"

@dataclass
class RunningCommand:
    """
    Result handle returned by run_command for blocking and non-blocking runs alike.

    Callers read output through the ``stdout`` / ``stderr`` properties, which
    always return an iterator of lines, even when no output source was attached.
    """
    # PID of the child process; run_command passes None for blocking runs and failures.
    pid: Optional[int]
    # Current lifecycle state of the command.
    state: CommandState
    # Exit status when known; None until then.
    return_code: Optional[int] = None
    # Underlying Popen object, set for non-blocking runs.
    _process: Optional[subprocess.Popen] = None
    # Line iterators backing the stdout / stderr properties.
    _stdout_iter: Optional[Iterator[str]] = None
    _stderr_iter: Optional[Iterator[str]] = None
    # The child's stdin stream, when one was opened.
    _stdin: Optional[IO] = None

    @property
    def stdout(self) -> Iterator[str]:
        """Iterator over stdout lines; empty when no stdout source is attached."""
        lines = self._stdout_iter
        return lines if lines is not None else iter(())

    @property
    def stderr(self) -> Iterator[str]:
        """Iterator over stderr lines; empty when no stderr source is attached."""
        lines = self._stderr_iter
        return lines if lines is not None else iter(())

def create_nonblocking_pipe_reader(pipe: Optional[IO[bytes]], process: subprocess.Popen[Any]) -> Iterator[str]:
    """
    Create a generator that yields decoded lines from a subprocess pipe.

    The pipe is switched to non-blocking mode on first iteration and is then
    polled with ``select``; when nothing is available the generator sleeps for
    10 ms and polls again, so iterating blocks the caller until a line arrives
    or the stream ends.

    Bytes are decoded as UTF-8 with an incremental decoder, so a multi-byte
    character split across two reads is reassembled, and undecodable bytes
    become U+FFFD instead of raising UnicodeDecodeError mid-stream.

    Args:
        pipe: Binary pipe to read from (e.g. ``process.stdout``), or None.
        process: The process that owns the pipe; polled to detect termination.

    Returns:
        An iterator of lines without their trailing newline. A final
        unterminated fragment is yielded as the last line. Empty when
        ``pipe`` is None.

    Raises:
        OSError: Reading the pipe failed with an error other than EAGAIN
            (raised while iterating).
    """
    if pipe is None:
        return iter(())

    # Imported locally so this function does not rely on a module-level import.
    import codecs

    def _reader() -> Iterator[str]:
        # Set pipe to non-blocking mode
        flags = fcntl.fcntl(pipe, fcntl.F_GETFL)
        fcntl.fcntl(pipe, fcntl.F_SETFL, flags | os.O_NONBLOCK)

        # Carries incomplete multi-byte sequences from one read to the next.
        decoder = codecs.getincrementaldecoder('utf-8')(errors='replace')
        buffer = ""

        while True:
            # Check if process has ended and pipe is empty
            if process.poll() is not None:
                if not select.select([pipe], [], [], 0)[0]:
                    buffer += decoder.decode(b'', final=True)
                    if buffer:
                        yield buffer
                    break

            try:
                # Check for immediately available data
                ready, _, _ = select.select([pipe], [], [], 0)
                if ready:
                    chunk = os.read(pipe.fileno(), 4096)
                    if not chunk:  # EOF
                        # Flush any dangling partial character before finishing.
                        buffer += decoder.decode(b'', final=True)
                        if buffer:
                            yield buffer
                        break

                    buffer += decoder.decode(chunk)
                    while '\n' in buffer:
                        line, buffer = buffer.split('\n', 1)
                        yield line
                else:
                    time.sleep(0.01)
            except (IOError, OSError) as err:
                if err.errno != errno.EAGAIN:
                    logger.error(f"Pipe read error: {err}")
                    raise
                time.sleep(0.01)  # Prevent tight loop when no data

    return _reader()

def create_blocking_pipe_reader(output: str) -> Iterator[str]:
    """Yield the lines of an already-captured output string, one at a time.

    Lines are split with ``str.splitlines`` and carry no line terminators.
    """
    for line in output.splitlines():
        yield line

def write_subprocess_log(log_file: str, data: bytes) -> None:
    """
    Append raw subprocess output to a log file.

    The file is opened in binary append mode for each call, so existing
    content is preserved and the file is created when missing.

    Args:
        log_file: Path of the log file.
        data: Bytes to append.
    """
    with open(log_file, 'ab') as sink:
        sink.write(data)
        sink.flush()

def run_command(
    command: Union[str, Dict[str, str]],
    cwd: Optional[str] = None,
    input: Optional[str] = None,
    log_file: Optional[str] = None,
    active_venv: Optional[str] = None,
    blocking: bool = True,
    env: Optional[Dict[str, str]] = None
) -> RunningCommand:
    """
    Unified command runner that handles both blocking and non-blocking cases.

    The command is always executed through ``/bin/bash`` with ``shell=True``.

    Args:
        command: The command line to run, or a dict with 'development' and
            'production' entries; the entry is chosen by KAMIWAZA_DEBUG
            ('true' selects 'development').
        cwd: Working directory for the command
        input: Optional text to pass to the command's stdin
        log_file: Optional file that receives the command's stdout and stderr
            (appended as raw bytes)
        active_venv: Optional path to virtual environment activate script,
            sourced before the command runs
        blocking: Whether to wait for the command to finish before returning
        env: Optional environment variables for the child process

    Returns:
        RunningCommand object with access to process state and output.
        Blocking: state is COMPLETED (return code 0) or FAILED, pid is None.
        Non-blocking: state is RUNNING with the child's pid and live
        line iterators. If launching raises, a FAILED result with
        return_code -1 is returned and the error text is on stderr.

    Raises:
        KeyError: ``command`` is a dict without the selected environment key
            (the lookup happens before the try block).
    """
    # Handle dict command by checking environment
    if isinstance(command, dict):
        # Check if we're in development by looking for KAMIWAZA_DEBUG
        is_dev = os.getenv('KAMIWAZA_DEBUG', 'false') == 'true'
        env_type = 'development' if is_dev else 'production'
        command = command[env_type]

    logger.debug(f"Running command: {command} (blocking={blocking})")

    # `source` is a bash builtin; this is why executable='/bin/bash' is forced below.
    if active_venv:
        command = f"source {active_venv} && {command}"
    
    try:
        if blocking:
            result = subprocess.run(
                command,
                shell=True,
                executable='/bin/bash',
                cwd=cwd,
                input=input,
                capture_output=True,
                text=True,
                env=env
            )

            # If a log file is specified, write the output with dual-write support
            # (all of stdout is appended first, then all of stderr)
            if log_file:
                if result.stdout:
                    write_subprocess_log(log_file, result.stdout.encode('utf-8'))
                if result.stderr:
                    write_subprocess_log(log_file, result.stderr.encode('utf-8'))

            return RunningCommand(
                pid=None,  # Process already completed
                state=CommandState.COMPLETED if result.returncode == 0 else CommandState.FAILED,
                return_code=result.returncode,
                _stdout_iter=create_blocking_pipe_reader(result.stdout),
                _stderr_iter=create_blocking_pipe_reader(result.stderr)
            )
        else:
            # Modified non-blocking setup
            process = subprocess.Popen(
                command,
                shell=True,
                executable='/bin/bash',
                cwd=cwd,
                stdin=subprocess.PIPE if input else None,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=False,  # Changed to False for unbuffered output
                bufsize=0,   # Disable buffering
                env=env
            )
            
            # stdin is written and flushed but left open; it is handed back to
            # the caller as RunningCommand._stdin.
            if input and process.stdin:
                process.stdin.write(input.encode('utf-8'))
                process.stdin.flush()
            
            # Modified tee_output function with dual-write support
            # NOTE(review): this thread reads the same stdout/stderr pipes that
            # back the iterators returned below, so when log_file is set each
            # chunk reaches either the log or the iterator, not both — confirm
            # that callers do not rely on both.
            if log_file:
                def tee_output():
                    while process.poll() is None:  # Check if process is still running
                        # Read from stdout
                        if process.stdout:
                            ready, _, _ = select.select([process.stdout], [], [], 0.1)
                            if ready:
                                data = os.read(process.stdout.fileno(), 4096)
                                if data:
                                    write_subprocess_log(log_file, data)

                        # Read from stderr
                        if process.stderr:
                            ready, _, _ = select.select([process.stderr], [], [], 0.1)
                            if ready:
                                data = os.read(process.stderr.fileno(), 4096)
                                if data:
                                    write_subprocess_log(log_file, data)

                    # Drain any remaining output after process ends
                    for pipe in [process.stdout, process.stderr]:
                        if pipe:
                            data = pipe.read()
                            if data:
                                write_subprocess_log(log_file, data)

                # Daemon thread: it does not keep the interpreter alive at exit.
                threading.Thread(target=tee_output, daemon=True).start()
            
            return RunningCommand(
                pid=process.pid,
                state=CommandState.RUNNING,
                _process=process,
                _stdout_iter=create_nonblocking_pipe_reader(process.stdout, process),
                _stderr_iter=create_nonblocking_pipe_reader(process.stderr, process),
                _stdin=process.stdin
            )
            
    except Exception as e:
        # Launch failures are reported through the return value, not raised.
        logger.error(f"Command execution failed: {str(e)}")
        return RunningCommand(
            pid=None,
            state=CommandState.FAILED,
            return_code=-1,
            _stdout_iter=create_blocking_pipe_reader(""),
            _stderr_iter=create_blocking_pipe_reader(str(e))
        )


def ensure_container_images(service_name: str, service_config: Dict) -> bool:
    """
    Make sure the Docker images a container service needs exist locally.

    Pulling up front keeps image download time out of container startup time.
    The check is skipped when ``docker compose ls`` already lists an entry
    named ``<KAMIWAZA_ENV>-kamiwaza-*``. Otherwise each required component's
    compose file is inspected, and components with missing images are pulled.

    Args:
        service_name: Service being started (used for logging and to pick the
            worker component list).
        service_config: Service settings; ``working_directory`` is read,
            defaulting to KAMIWAZA_ROOT.

    Returns:
        False only when a ``docker compose pull`` fails. True when images are
        present or pulled, and also when the check itself raises (startup is
        deliberately not blocked by a failed check).
    """
    try:
        configured_dir = service_config.get('working_directory', KAMIWAZA_ROOT)
        working_dir = cast(str, configured_dir or KAMIWAZA_ROOT)

        # Existing compose projects suggest this is not a first-time startup.
        logger.info(f"Checking if container images need to be pulled for {service_name}...")

        listing = run_command(
            "docker compose ls -a --format json",
            cwd=working_dir,
            env=os.environ.copy()
        )

        if listing.state == CommandState.COMPLETED:
            listing_text = '\n'.join(listing.stdout)
            if listing_text.strip():
                try:
                    entries = json.loads(listing_text)
                    env_name = os.getenv('KAMIWAZA_ENV', 'default')
                    existing = [
                        entry for entry in entries
                        if entry['Name'].startswith(f'{env_name}-kamiwaza-')
                    ]

                    if existing:
                        logger.info(f"Found existing containers for {service_name}, skipping image pull")
                        return True
                    logger.info(f"No existing containers found for {service_name}, checking images...")
                except json.JSONDecodeError:
                    logger.debug("Could not parse container list, will check images...")

        logger.info(f"Checking for required Docker images for {service_name}...")

        # Worker nodes need the fewest components, lite mode a reduced set.
        if is_worker_node() and service_name == 'containers-worker':
            required_components = ['etcd', 'traefik']
        elif os.getenv('KAMIWAZA_LITE', 'false').lower() == 'true':
            required_components = ['etcd', 'milvus', 'traefik']
        else:
            required_components = ['cockroachdb', 'datahub', 'etcd', 'milvus', 'traefik']

        # Collect (component, image) pairs for every image not present locally.
        missing_images = []
        for component in required_components:
            compose_dir = f"kamiwaza/deployment/{component}/standard"
            compose_file = os.path.join(working_dir, compose_dir, "docker-compose.yml")
            if not os.path.exists(compose_file):
                continue

            # Ask compose which images this component references.
            config_result = run_command(
                f"docker compose -f {compose_file} config --images",
                cwd=working_dir,
                env=os.environ.copy()
            )
            if config_result.state != CommandState.COMPLETED:
                continue

            image_names = [name.strip() for name in config_result.stdout if name.strip()]
            for image in image_names:
                # A failing inspect means the image is absent locally.
                inspect_result = run_command(
                    f"docker image inspect {image}",
                    env=os.environ.copy()
                )
                if inspect_result.state != CommandState.COMPLETED:
                    missing_images.append((component, image))

        if not missing_images:
            logger.info(f"All required images already available locally for {service_name}")
            return True

        logger.info(f"Found {len(missing_images)} missing images for {service_name}, pulling...")
        logger.info("This may take several minutes on first startup or with slower network connections...")

        # One pull per component, however many of its images are missing.
        components_to_pull = {comp for comp, _ in missing_images}

        for component in components_to_pull:
            logger.info(f"Pulling images for component: {component}")
            compose_dir = f"kamiwaza/deployment/{component}/standard"

            pull_result = run_command(
                "docker compose -f docker-compose.yml pull",
                cwd=os.path.join(working_dir, compose_dir),
                env=os.environ.copy()
            )

            if pull_result.state != CommandState.COMPLETED:
                logger.error(f"Failed to pull images for {component}")
                for line in pull_result.stderr:
                    logger.error(f"Pull error: {line.rstrip()}")
                return False

            logger.info(f"Successfully pulled images for {component}")

        logger.info(f"All required images pulled successfully for {service_name}")
        return True

    except Exception as e:
        logger.error(f"Error ensuring container images for {service_name}: {e}")
        logger.error(f"Traceback: {traceback.format_exc()}")
        # Don't fail startup just because image check failed - let docker compose handle it
        logger.warning("Continuing with startup despite image check failure...")
        return True

def record_container_startup():
    """Remember the current time as the moment containers began starting.

    The timestamp is stored under the node's container service name
    ('containers-worker' on worker nodes, 'containers' otherwise) in two places:
    the in-memory ``container_startup_times`` dict for this process, and a file
    under /tmp so other processes can read it. A failed file write is logged
    as a warning and otherwise ignored.
    """
    service_name = 'containers-worker' if is_worker_node() else 'containers'
    started_at = time.time()

    # In-memory copy for the daemon process (item assignment needs no `global`).
    container_startup_times[service_name] = started_at

    # On-disk copy for cross-process access
    marker_path = f"/tmp/kamiwaza_container_startup_{service_name}"
    try:
        with open(marker_path, 'w') as marker:
            marker.write(str(started_at))
        logger.info(f"Recorded startup time for {service_name} (time: {started_at})")
    except Exception as e:
        logger.warning(f"Failed to write startup time to file: {e}")

def is_within_startup_grace_period() -> bool:
    """Tell whether containers were started recently enough to still be settling.

    The start time comes from ``container_startup_times`` when this process
    recorded it, otherwise from the /tmp marker file. The allowed window is
    CONTAINER_FIRST_STARTUP_TIMEOUT (default 900 s) when ``docker compose ls``
    shows no ``<KAMIWAZA_ENV>-kamiwaza-*`` entry, and CONTAINER_STARTUP_TIMEOUT
    (default 300 s) otherwise.

    Returns:
        True while the elapsed time since startup is below the grace period;
        False once it has passed or when no start time is recorded.
    """
    service_name = 'containers-worker' if is_worker_node() else 'containers'

    started_at = None
    if service_name in container_startup_times:
        # Recorded by this (daemon) process
        started_at = container_startup_times[service_name]
    else:
        # Recorded by another process: fall back to the marker file
        marker_path = f"/tmp/kamiwaza_container_startup_{service_name}"
        try:
            if os.path.exists(marker_path):
                with open(marker_path, 'r') as marker:
                    started_at = float(marker.read().strip())
        except Exception as e:
            logger.debug(f"Failed to read startup time from file: {e}")

    if started_at is None:
        logger.debug(f"No startup time found for {service_name}")
        return False

    # A first-time startup may need to download large images, so it gets a
    # longer window. Any failure while probing counts as "not first time".
    first_time_startup = False
    try:
        listing = run_command("docker compose ls -a --format json", env=os.environ.copy())
        if listing.state == CommandState.COMPLETED:
            listing_text = '\n'.join(listing.stdout)
            if listing_text.strip():
                entries = json.loads(listing_text)
                env_name = os.getenv('KAMIWAZA_ENV', 'default')
                matching = [
                    entry for entry in entries
                    if entry['Name'].startswith(f'{env_name}-kamiwaza-')
                ]
                first_time_startup = not matching
    except Exception:
        first_time_startup = False

    if not first_time_startup:
        # Subsequent startups: default 5 minutes
        grace_period = int(os.getenv('CONTAINER_STARTUP_TIMEOUT', '300'))
        logger.debug(f"Using normal startup timeout: {grace_period}s")
    else:
        # First-time startup: default 15 minutes
        grace_period = int(os.getenv('CONTAINER_FIRST_STARTUP_TIMEOUT', '900'))
        logger.debug(f"Using first-time startup timeout: {grace_period}s")

    elapsed = time.time() - started_at

    logger.debug(f"Container startup: service={service_name}, elapsed={elapsed:.1f}s, grace_period={grace_period}s, first_time={first_time_startup}")
    return elapsed < grace_period

def check_docker_status() -> str:
    """Check the status of Docker containers with startup grace period.

    Runs the ``startup_check`` command of the container service that applies
    to this node (``containers-worker`` on worker nodes, ``containers``
    otherwise) and feeds its output to the configured validation command.

    Returns:
        'running'  - the validation command completed successfully.
        'starting' - the validation command exited with code 2, or it failed
                     while still inside the startup grace period.
        'error'    - the check command failed, produced no output or
                     unparsable JSON, validation failed outside the grace
                     period, or any exception occurred (exceptions are logged
                     here and never propagated).
    """
    try:
        # Evaluated up front; only consulted if validation fails below.
        within_grace_period = is_within_startup_grace_period()

        # Determine node type
        is_worker = is_worker_node()

        if config is None:
            raise RuntimeError("Runtime configuration has not been loaded")
        services_config = cast(Dict[str, Any], config.get('services', {}))

        # Select the appropriate config based on node type
        if is_worker:
            docker_config = cast(Dict[str, Any], services_config['containers-worker'])
            logger.debug("Using containers-worker configuration")
        else:
            docker_config = cast(Dict[str, Any], services_config['containers'])
            logger.debug("Using containers configuration")

        working_dir = cast(str, docker_config['working_directory'])
        startup_check = docker_config['startup_check']

        logger.debug("Checking Docker container status...")
        logger.debug(f"Running command: {startup_check['command']}")
        result = run_command(startup_check['command'], cwd=working_dir, env=os.environ.copy())

        # Add detailed debug output for the command result
        logger.debug("Docker command output:")
        logger.debug(f"  stdout: {result.stdout}")
        logger.debug(f"  stderr: {result.stderr}")
        logger.debug(f"  return_code: {result.return_code}")

        if result.state == CommandState.FAILED:
            logger.error(f"Docker compose command failed with code {result.return_code}")
            return 'error'

        # Collect all stdout lines into a single string
        stdout = '\n'.join(result.stdout)
        if not stdout:
            logger.error("No output from docker compose ls command")
            return 'error'

        try:
            containers = json.loads(stdout)
            logger.debug(f"Parsed container status: {json.dumps(containers, indent=2)}")
        except json.JSONDecodeError:
            logger.error("Failed to parse container status JSON")
            return 'error'

        logger.debug("Running container validation...")
        logger.debug(f"Running validation command: {startup_check['validation']}")
        validation_result = run_command(
            startup_check['validation'],
            cwd=working_dir,
            input=stdout,
            env=os.environ.copy()
        )

        # Materialize each stream exactly once. If these are one-shot
        # iterators, re-reading them after the debug dump would yield nothing
        # and the status messages below would silently lose the stderr text.
        validation_stdout = list(validation_result.stdout)
        validation_stderr = list(validation_result.stderr)

        # Add detailed debug output for validation
        logger.debug("Validation command output:")
        logger.debug(f"  stdout: {validation_stdout}")
        logger.debug(f"  stderr: {validation_stderr}")
        logger.debug(f"  return_code: {validation_result.return_code}")

        if validation_result.state == CommandState.COMPLETED:
            logger.debug("All containers are in expected state")
            return 'running'

        stderr_output = '\n'.join(validation_stderr)

        if validation_result.return_code == 2:
            # Exit code 2 indicates containers are starting (not an error)
            logger.debug("Containers are starting up")
            if stderr_output:
                logger.debug(f"Container startup status: {stderr_output}")
            return 'starting'

        # Container validation failed - check if we should apply grace period
        if within_grace_period:
            logger.debug("Containers failed validation but within startup grace period - returning 'starting'")
            return 'starting'

        logger.debug("Containers not in expected state")
        if stderr_output:
            logger.debug(f"Validation error: {stderr_output}")
        return 'error'

    except Exception as e:
        logger.error(f"Error checking docker status: {str(e)}")
        logger.error(f"Traceback: {traceback.format_exc()}")
        return 'error'

def is_service_enabled(service_config: dict) -> bool:
    """
    Evaluate if a service is enabled.

    The ``enabled`` key of ``service_config`` (default: True) may be:
    - a boolean (True/False), returned as-is
    - the string "true"/"false" (case-insensitive)
    - an environment variable reference, ``$VAR`` or ``${VAR}``; the service
      is enabled only when that variable is set to "true" (case-insensitive)
    - any other string, which is run as a shell command; exit code 0 means
      enabled, and any error while running it means disabled
    - any other value, which is interpreted by its truthiness
    """
    enabled = service_config.get('enabled', True)

    # Handle boolean values
    if isinstance(enabled, bool):
        return enabled

    # Handle string values
    if isinstance(enabled, str):
        # Check for literal true/false
        lowered = enabled.lower()
        if lowered == 'true':
            return True
        if lowered == 'false':
            return False

        # If it starts with $, treat as environment variable
        if enabled.startswith('$'):
            # Strip the leading "$"/"${" and also the closing "}". Stripping
            # only on the left would leave "${VAR}" as "VAR}", a name that
            # never exists in the environment.
            var_name = enabled.lstrip('${}').rstrip('}')
            return os.environ.get(var_name, '').lower() == 'true'

        # Otherwise treat as command to evaluate
        try:
            result = run_command(enabled, blocking=True, env=os.environ.copy())
            return result.return_code == 0
        except Exception as e:
            logger.error(f"Error evaluating enabled command: {e}")
            return False

    return bool(enabled)

def check_service_startup(service_name: str, service_config: Dict) -> bool:
    """Verify a service's startup via its configured ``startup_check``.

    Returns True when no startup check is configured, when the check command
    completes and (if configured) its validation script accepts the output,
    when a container service's validation reports exit code 2 ("starting"),
    or when the validation stderr mentions "not-applicable" or "disabled".
    Returns False otherwise, including when any exception is raised.
    """
    if 'startup_check' not in service_config:
        logger.debug(f"No startup check defined for {service_name}")
        return True

    try:
        startup_check = service_config['startup_check']
        command = startup_check['command']
        logger.debug(f"Running startup check command for {service_name}: {command}")

        check_result = run_command(
            command,
            cwd=service_config.get('working_directory'),
            active_venv=service_config.get('venv'),
            blocking=True
        )

        # Capture stdout before anything else so it can be logged and piped on
        stdout_lines = list(check_result.stdout)
        raw_output = '\n'.join(stdout_lines)

        # Per-service verbose dump of the check command's streams
        if service_config.get('debug', False):
            logger.debug(f"{service_name} check command output:")
            for out_line in stdout_lines:
                logger.debug(f"  stdout: {out_line.rstrip()}")
            for err_line in check_result.stderr:
                logger.debug(f"  stderr: {err_line.rstrip()}")

        if check_result.state != CommandState.COMPLETED:
            logger.error(f"Startup check command failed for {service_name}")
            return False

        # Without a validation script, a completed check command is enough
        if 'validation' not in startup_check:
            return True

        logger.debug(f"Running validation script for {service_name}")
        validation_result = run_command(
            startup_check['validation'],
            input=raw_output,
            cwd=service_config.get('working_directory'),
            active_venv=service_config.get('venv'),
            blocking=True,
            env=os.environ.copy()
        )

        exit_code = validation_result.return_code
        if exit_code == 0:
            logger.info(f"Service {service_name} startup verified successfully")
            return True

        if exit_code == 2 and service_name in ['containers', 'containers-worker']:
            # Exit code 2 for containers means "starting" - acceptable for startup check
            logger.info(f"Service {service_name} containers are starting up")
            return True

        # A validator may report on stderr that the check does not apply here
        for err_line in validation_result.stderr:
            lowered = err_line.lower()
            if "not-applicable" in lowered or "disabled" in lowered:
                return True

        logger.error(f"{service_name} startup verification failed")
        return False

    except Exception as e:
        logger.error(f"Error checking {service_name} startup: {e}")
        logger.error(f"Traceback: {traceback.format_exc()}")
        return False



def is_service_ready(service_name: str) -> bool:
    """Report whether a service is ready.

    Disabled services and services without a ``process_check`` count as
    ready. Container services delegate to their startup check. Every other
    service is ready when its ``process_check`` command exits with code 0;
    an exception while running it is logged and reported as not ready.
    """
    service_config = get_service_config(service_name)

    # A disabled service never blocks anything waiting on it
    if not is_service_enabled(service_config):
        logger.debug(f"Service {service_name} is disabled, skipping readiness check")
        return True

    # Container services are judged by their startup check
    if service_name in ('containers', 'containers-worker'):
        return check_service_startup(service_name, service_config)

    # Nothing to probe for this service
    if 'process_check' not in service_config:
        return True

    try:
        probe = run_command(
            service_config['process_check'],
            cwd=service_config.get('working_directory'),
            active_venv=service_config.get('venv'),
            blocking=True,
            env=os.environ.copy()
        )
    except Exception as e:
        logger.error(f"Error checking {service_name} readiness: {e}")
        return False
    return probe.return_code == 0




def start_services() -> None:
    """Start every enabled service, following the configured startup order.

    For each entry in ``startup_order`` this waits for the service's
    dependencies to become ready, optionally skips services that are already
    running (``check_first``), and then tries to start the service up to
    ``max_startup_attempts`` times. Any failure is logged, raised as an
    alert, and terminates the process with exit code 1.
    """
    logger.info("Starting services...")
    try:
        if config is None:
            raise RuntimeError("Runtime configuration has not been loaded")

        startup_order_raw = config.get('startup_order', [])
        if not isinstance(startup_order_raw, list):
            raise TypeError("Configuration startup_order must be a list")

        services_section = get_services_section()

        for service_name in startup_order_raw:
            if not isinstance(service_name, str):
                logger.warning(f"Skipping invalid service name entry: {service_name}")
                continue

            logger.info(f"Processing service {service_name} based on startup order")
            service_config = services_section.get(service_name)
            if service_config is None:
                logger.error(f"Configuration missing service definition for {service_name}")
                continue

            # Use is_service_enabled consistently
            if not is_service_enabled(service_config):
                logger.info(f"Skipping disabled service: {service_name}")
                continue

            # Block until every dependency reports ready (polled every 5 seconds)
            if 'dependencies' in service_config:
                for dep in service_config['dependencies']:
                    while True:
                        if is_service_ready(dep):
                            break
                        logger.info(f"Waiting for dependency {dep} to be ready before starting {service_name}")
                        time.sleep(5)

            # With check_first, a service that is already up is left alone
            if service_config.get('check_first', False):
                logger.info(f"Checking if {service_name} is already running")
                if check_service_startup(service_name, service_config):
                    status = check_service_status(service_name, service_config)
                    if status == 'running':
                        logger.info(f"{service_name} is already running, skipping start")
                        service_states[service_name] = 'running'
                        continue
                    logger.info(f"{service_name} check failed ({status}), will attempt to start")

            # Normal start path with bounded retries
            max_attempts = service_config.get('max_startup_attempts', 1)
            started = False
            for attempt in range(max_attempts):
                attempt_number = attempt + 1
                logger.info(f"Starting {service_name} (attempt {attempt_number}/{max_attempts})")

                if start_service(service_config, service_name):
                    logger.info(f"{service_name} started successfully")
                    service_states[service_name] = 'running'
                    started = True
                    break

                if attempt_number < max_attempts:
                    logger.warning(f"{service_name} startup attempt {attempt_number} failed, retrying...")
                    time.sleep(service_config.get('startup_check_interval', 5))

            if not started:
                # All attempts exhausted: record the state, clean up, and abort
                service_states[service_name] = 'stopped'
                cleanup_command = service_config.get('cleanup_on_failed_start', False)
                if cleanup_command:
                    run_command(
                        cleanup_command,
                        cwd=service_config.get('working_directory'),
                        active_venv=service_config.get('venv'),
                        blocking=True,
                        env=os.environ.copy(),
                    )
                raise RuntimeError(f"Failed to start {service_name} after {max_attempts} attempts")

    except Exception as e:
        logger.error(f"Failed to start services: {str(e)}")
        logger.error(f"Traceback: {traceback.format_exc()}")
        send_alert(f"Failed to start services: {str(e)}")
        sys.exit(1)

def stop_service(service_name: str, is_shutdown: bool = False) -> bool:
    """
    Stop one service according to its configuration.

    A configured ``stop_command`` takes precedence; otherwise the deprecated
    ``stop_process`` entry is used to find and kill matching processes.
    When ``is_shutdown`` is set, the service is marked 'stopped' in
    ``service_states`` regardless of the outcome.

    Returns True if the service was stopped, was not running, is unknown, or
    the configuration is not loaded; False if stopping it failed.
    """
    if config is None:
        # Config not loaded (likely in test environment)
        logger.warning("Configuration not loaded, cannot stop service")
        return True

    services_section = get_services_section()
    if service_name not in services_section:
        logger.warning(f"Unknown service: {service_name}")
        return True

    service_config = services_section[service_name]
    debug = service_config.get('debug', False)
    stopped_ok = True

    try:
        if debug:
            logger.debug(f"Stopping {service_name}")

        if 'stop_command' in service_config:
            # Preferred path: the service ships its own stop command
            stop_cmd = service_config['stop_command']
            if debug:
                logger.debug(f"Running stop command: {stop_cmd}")

            outcome = run_command(
                stop_cmd,
                cwd=service_config.get('working_directory'),
                active_venv=service_config.get('venv'),
                blocking=True,
                env=os.environ.copy()
            )

            if outcome.state != CommandState.COMPLETED:
                logger.error(f"Stop command for {service_name} failed")
                stopped_ok = False

        elif 'stop_process' in service_config:
            # Legacy path: kill processes by name
            logger.warning(f"Service {service_name} using deprecated stop_process configuration")
            process_identifiers = service_config['stop_process']
            if isinstance(process_identifiers, str):
                process_identifiers = [process_identifiers]

            if debug:
                logger.debug(f"Stopping processes matching: {process_identifiers}")

            # Exact name matches first, then fall back to full command lines
            pids = get_pids_by_name(process_identifiers) or get_pids_by_name(
                process_identifiers, full_cmdline=True
            )

            if pids:
                for pid in pids:
                    if debug:
                        logger.debug(f"Killing PID {pid}")
                    killed = kill_process(pid, service_name)
                    if not killed:
                        stopped_ok = False
            elif debug:
                logger.debug("No matching processes found")

    except Exception as e:
        logger.error(f"Error stopping {service_name}: {e}")
        logger.error(f"Traceback: {traceback.format_exc()}")
        stopped_ok = False

    finally:
        if is_shutdown:
            service_states[service_name] = 'stopped'

    return stopped_ok

def stop_all_services(is_shutdown: bool = False) -> bool:
    """
    Stop all services in reverse startup order with progress indicators.

    Services are skipped when they have no configuration, do not apply to
    this node type, are disabled, or report a 'not-applicable' status.
    Container services ('containers' and 'containers-worker') are always
    sent their stop command, even when the status check does not report them
    as running. Other services are only stopped when their status is
    'running', and are otherwise reported as "not running".

    Parameters:
    - is_shutdown (bool): Passed through to stop_service for each service.

    Returns:
    - bool: True if every attempted stop succeeded (or nothing needed
      stopping), False if any stop attempt failed.

    Raises:
    - RuntimeError: if the runtime configuration has not been loaded.
    - TypeError: if the configured startup_order is not a list.
    """
    all_success = True
    result_messages = []

    # Check if output is to a terminal for color support
    use_colors = sys.stdout.isatty()

    def colorize(text, color_code):
        """Wrap text in an ANSI color sequence when writing to a terminal."""
        if use_colors:
            return f"\033[{color_code}m{text}\033[0m"
        return text

    def display_name_for(service_name):
        """Turn a service key into the human-readable name shown to the user."""
        # Service keys use both '_' and '-' as separators (e.g. 'containers-worker')
        display_name = service_name.replace('_', ' ').replace('-', ' ').title()
        if display_name == 'Containers Worker':
            return 'Worker Containers'
        if display_name == 'Azure Disk Mount':
            return 'Azure Storage'
        return display_name

    def print_progress(service_name, status_msg, color="37"):
        """Print progress with consistent formatting (no newline, can be overwritten)"""
        display_name = display_name_for(service_name)
        print(f"  {colorize('⏳', '33')} {display_name:<20} {colorize(status_msg, color)}", end='', flush=True)

    def print_result(service_name, status_msg, color="37"):
        """Print final result with consistent formatting"""
        display_name = display_name_for(service_name)

        if "failed" in status_msg.lower():
            symbol = colorize('✗', '31')  # Red X
        elif "not running" in status_msg.lower():
            symbol = colorize('○', '90')  # Gray circle
        else:
            symbol = colorize('✓', '32')  # Green checkmark

        print(f"  {symbol} {display_name:<20} {colorize(status_msg.upper(), color)}")
        sys.stdout.flush()  # Ensure immediate output

    # Header
    print()
    print(colorize("═" * 60, "36"))  # Cyan border
    print(colorize("  STOPPING KAMIWAZA SERVICES", "36;1"))  # Cyan bold
    print(colorize("═" * 60, "36"))
    print()

    # Build stop list excluding services that are disabled or not applicable on this node
    if config is None:
        raise RuntimeError("Runtime configuration has not been loaded")

    is_worker = is_worker_node()
    services_section = get_services_section()
    startup_order_raw = config.get('startup_order', [])
    if not isinstance(startup_order_raw, list):
        raise TypeError("Configuration startup_order must be a list")

    services_to_stop = []
    for s in reversed(startup_order_raw):
        service_cfg = services_section.get(s)
        if not service_cfg:
            continue
        # Skip services that do not apply to this node type
        if s == 'containers' and is_worker:
            continue
        if s == 'containers-worker' and not is_worker:
            continue
        # Respect dynamic enablement logic
        if not is_service_enabled(service_cfg):
            continue
        # Skip services that are not applicable on this environment
        try:
            current_status = check_service_status(s, service_cfg)
            if current_status == 'not-applicable':
                continue
        except Exception:
            # If status check fails, fall back to including the service to avoid missing a real stop
            pass
        services_to_stop.append(s)

    if not services_to_stop:
        print(colorize("  No services to stop", "37"))
        print()
        return True

    total = len(services_to_stop)
    print(f"  Stopping {total} service(s) in reverse order...")
    print()

    for i, service_name in enumerate(services_to_stop, 1):
        logger.debug(f"Processing stop for {service_name}")

        service_cfg = services_section.get(service_name)
        if not service_cfg:
            logger.warning(f"Missing configuration for {service_name} during stop sequence")
            continue

        status = check_service_status(service_name, service_cfg)
        logger.debug(f"Current status of {service_name}: {status}")

        # Show progress
        print_progress(service_name, f"Stopping... ({i}/{total})", "33")  # Yellow

        # Container services may be 'starting' or 'error' and still have
        # containers up, so their stop command is always run. Other services
        # are only stopped when the status check says they are running.
        always_stop = service_name in ('containers', 'containers-worker')
        if always_stop or status == 'running':
            if stop_service(service_name, is_shutdown):
                result_msg = 'stopped'
                result_color = '32'  # Green
            else:
                # Any failed stop, containers included, must fail the overall result
                all_success = False
                result_msg = 'failed to stop'
                result_color = '31'  # Red
        else:
            result_msg = 'not running'
            result_color = '90'  # Gray

        # Update the progress line with final result
        print("\r\033[K", end="")  # Return to start of line and clear it
        sys.stdout.flush()
        print_result(service_name, result_msg, result_color)

        result_messages.append(result_msg)

        # Small delay between services for readability
        if i < total:
            time.sleep(0.2)

    print()
    print(colorize("─" * 60, "90"))  # Gray line

    # Count by exact outcome so a service name can never skew the summary
    success_count = result_messages.count('stopped')
    not_running_count = result_messages.count('not running')
    failed_count = len(result_messages) - success_count - not_running_count

    if failed_count == 0:
        if success_count > 0:
            print(f"  {colorize('✓', '32')} All services stopped successfully")
        else:
            print(f"  {colorize('○', '90')} All services were already stopped")
    else:
        print(f"  {colorize('⚠', '33')} {success_count} stopped, {failed_count} failed")

    print(colorize("─" * 60, "90"))
    print()

    return all_success

def check_service_status(service_name: str, service_config: Dict) -> str:
    """Return the current status string for a service.

    Container services return 'not-applicable' when they do not match this
    node's type, otherwise the result of check_docker_status(). Other
    services are probed with their ``process_check`` command: 'running' when
    it completes, 'stopped' when it does not, 'error' when running it raises.
    Services without a ``process_check`` return 'not-applicable'.
    """
    try:
        # Verbose output is only produced when the logger is at DEBUG level
        verbose = logger.isEnabledFor(logging.DEBUG)

        if service_name in ('containers', 'containers-worker'):
            # Only the container service matching this node type is relevant
            applicable_name = 'containers-worker' if is_worker_node() else 'containers'
            if service_name != applicable_name:
                return 'not-applicable'
            return check_docker_status()

        if 'process_check' not in service_config:
            logger.debug(f"No process check found for service {service_name}")
            return 'not-applicable'

        try:
            if verbose:
                logger.debug(f"Running process check command: {service_config['process_check']}")
            result = run_command(
                service_config['process_check'],
                active_venv=service_config.get('venv'),
                env=os.environ.copy()
            )

            if verbose:
                logger.debug("=== Process Check Output ===")
                logger.debug(f"Return code: {result.return_code}")
                logger.debug("STDOUT:")
                for out_line in list(result.stdout):
                    logger.debug(f"  {out_line.rstrip()}")
                logger.debug("STDERR:")
                for err_line in list(result.stderr):
                    logger.debug(f"  {err_line.rstrip()}")
                logger.debug("=== End Process Check Output ===")

            completed = result.state == CommandState.COMPLETED
            if verbose:
                if completed:
                    logger.debug(f"Process check for {service_name} succeeded")
                else:
                    logger.debug(f"Process check for {service_name} failed with code {result.return_code}")
            return 'running' if completed else 'stopped'

        except Exception as e:
            logger.error(f"Error running process check for {service_name}: {e}")
            logger.error(f"Traceback: {traceback.format_exc()}")
            return 'error'

    except Exception as e:
        logger.error(f"Error checking status for {service_name}: {e}")
        return 'error'

        
        

def start_service(service_config: Dict, service_name: str) -> bool:
    """Start a service and return whether it started successfully.

    Steps, in order:
    1. Skip (returning True) when the service's ``platform`` does not match
       the current operating system.
    2. Resolve the working directory; relative paths are joined onto the
       configured ``base_directory``.
    3. Build the command line from ``script`` (optionally keyed by
       'development'/'production' according to KAMIWAZA_DEBUG) and ``args``.
       Commands starting with "python" run with the interpreter from
       ``$KAMIWAZA_ROOT/venv``; ``.sh`` scripts are run through ``bash``.
    4. Run any ``pre_start`` commands; for container services, ensure the
       images are available and record the container startup time.
    5. Launch the command. When ``process_block`` is true (the default) the
       command must complete and the service must pass is_service_ready().

    Parameters:
    - service_config (Dict): Configuration mapping of the service.
    - service_name (str): Name of the service, used for logging and lookups.

    Returns:
    - bool: True if the service started (or was skipped for this platform),
      False on any failure. Exceptions are logged and reported as False.
    """
    try:
        if config is None:
            raise RuntimeError("Runtime configuration has not been loaded")

        script = service_config['script']
        target_os = service_config.get('platform', 'none').lower()
        os_type = platform.system().lower()
        if target_os != os_type and target_os != 'none':
            logger.info(f"Skipping {service_name} on {os_type} as it's configured for {target_os}")
            return True
        should_block = service_config.get('process_block', True)

        # Get and verify working directory
        if 'working_directory' not in service_config:
            raise ValueError(f"Working directory not specified for service {service_name}")

        cwd = os.path.expandvars(service_config['working_directory'])
        if not os.path.isabs(cwd):
            base_dir = cast(Optional[str], config.get('base_directory'))
            if base_dir is None:
                raise ValueError("Configuration missing base_directory")
            cwd = os.path.join(base_dir, cwd)

        if not os.path.isdir(cwd):
            raise FileNotFoundError(f"Working directory for {service_name} does not exist: {cwd}")

        # Get the script command
        if isinstance(script, dict):
            is_dev = os.getenv('KAMIWAZA_DEBUG', 'false') == 'true'
            env_type = 'development' if is_dev else 'production'
            if env_type not in script:
                raise ValueError(f"No script specified for environment type '{env_type}' in service {service_name}")
            script = script[env_type]

        # Build the argument suffix once, converting every item to str, so
        # that every command form below shares the same normalized arguments
        # (non-string args such as port numbers are common in YAML).
        args_suffix = ''
        if 'args' in service_config:
            args = service_config['args']
            if isinstance(args, list):
                args_suffix = ' ' + ' '.join(str(arg) for arg in args)
            else:
                args_suffix = f" {args}"

        if script.startswith('python'):
            # Run Python commands with the interpreter from the Kamiwaza venv
            if KAMIWAZA_ROOT is None:
                raise EnvironmentError("KAMIWAZA_ROOT is not set; cannot locate virtual environment")
            venv_path = os.path.join(KAMIWAZA_ROOT, 'venv')
            venv_python = get_venv_python(venv_path)
            script = script.replace('python', venv_python, 1) + args_suffix
        elif script.endswith('.sh') and not script.startswith('bash'):
            # Shell scripts are invoked explicitly through bash
            script = f"bash {script}{args_suffix}"
        else:
            script = f"{script}{args_suffix}"

        # Run any pre-start commands if configured
        if 'pre_start' in service_config:
            logger.info(f"Running pre-start commands for {service_name}")
            for pre_command in service_config['pre_start']:
                pre_result = run_command(pre_command, cwd=cwd)
                if pre_result.state != CommandState.COMPLETED:
                    raise RuntimeError(f"Pre-start command failed: {pre_command}")

        # Ensure container images are available before starting container services
        if service_name in ['containers', 'containers-worker']:
            logger.info(f"Ensuring container images are available for {service_name}...")
            if not ensure_container_images(service_name, service_config):
                logger.error(f"Failed to ensure container images for {service_name}")
                return False

            # Record startup time for container services AFTER image check
            record_container_startup()

        # Start the main service
        logger.info(f"Starting {service_name} with command: {script} in {cwd}")
        log_file = setup_process_logging(service_name)

        result = run_command(script, cwd=cwd, log_file=log_file, blocking=should_block, env=os.environ.copy())

        # For non-continuous services, wait for completion
        if should_block:
            if result.state != CommandState.COMPLETED:
                logger.error(f"{service_name} failed to complete successfully")
                return False

            # Give a moment for things to settle
            logger.info(f"Service {service_name} completed, waiting 5s before status check")
            time.sleep(5)

            # Verify the service started correctly
            if is_service_ready(service_name):
                logger.info(f"Service {service_name} startup verified successfully")
                return True
            else:
                logger.error(f"{service_name} startup verification failed")
                return False

        return True  # For continuous services, just return True if we got here

    except Exception as e:
        logger.error(f"Error starting {service_name}: {e}")
        logger.error(f"Traceback: {traceback.format_exc()}")
        return False

# Function to send alerts
def send_alert(message: str) -> None:
    """
    Emit an alert for a critical condition.

    Parameters:
    - message (str): The alert message to send.

    Currently the alert is only written to the log at INFO level; delivery
    via email, SMS, or other channels is not implemented yet.
    """
    # TODO: Implement the alert sending mechanism
    # (for example, send an email or push notification)
    logger.info(f"Alert: {message}")

def get_venv_python(venv_path: str) -> str:
    """Return the path of the Python interpreter inside a virtual environment.

    The path is taken from the context that ``venv.EnvBuilder`` computes for
    ``venv_path``; ``ensure_directories`` also creates the environment's
    directory layout if it is missing.
    """
    context = venv.EnvBuilder(with_pip=True).ensure_directories(venv_path)
    return cast(str, context.env_exe)

def is_process_running(pid: int) -> bool:
    """
    Check if a process with the given PID is running.

    Signal 0 is sent to the PID: nothing is delivered, but the operating
    system still performs its existence and permission checks.

    Parameters:
    - pid (int): The PID to check.

    Returns:
    - bool: True if the process is running, False otherwise.
    """
    try:
        os.kill(pid, 0)
    except ProcessLookupError:
        # No such process
        return False
    except PermissionError:
        # The process exists but belongs to another user, so we are not
        # allowed to signal it. It is still running.
        return True
    except OSError:
        # Any other failure (e.g. invalid argument) is treated as not running
        return False
    return True

def monitor_service(service_name: str, service_config: Dict) -> None:
    """Placeholder for per-service monitoring; intentionally does nothing.

    Monitoring is currently handled elsewhere, so both arguments are ignored.
    """
    return None

def monitor_docker_containers() -> None:
    """
    Check the Docker containers and restart them when validation fails.

    The 'containers' service configuration supplies three commands:
    ``startup_check.command`` (probe), ``startup_check.validation`` (fed the
    probe's stdout) and ``script`` (used to restart the containers).
    Any exception is logged and reported through send_alert.
    """
    logger.info("Checking Docker containers...")
    try:
        containers_cfg = get_service_config('containers')
        check_spec = containers_cfg['startup_check']

        # Step 1: run the probe command.
        probe = run_command(check_spec['command'])
        if probe.state == CommandState.FAILED:
            logger.warning("Docker container check failed")
            probe_errors = '\n'.join(probe.stderr)
            if probe_errors:
                logger.warning(f"Docker check error: {probe_errors}")
            return

        probe_output = '\n'.join(probe.stdout)
        if not probe_output:
            logger.warning("No output from docker container check")
            return

        # Step 2: hand the probe output to the validation command.
        verdict = run_command(check_spec['validation'], input=probe_output)
        if verdict.state == CommandState.COMPLETED:
            logger.debug("Docker containers are in the expected state")
            return

        # Step 3: validation did not complete, so re-run the container script.
        logger.warning("Docker containers are not in the expected state. Restarting...")
        relaunch = run_command(containers_cfg['script'])
        if relaunch.state == CommandState.COMPLETED:
            logger.info("Docker containers restarted successfully")
            return

        logger.error("Failed to restart Docker containers")
        relaunch_errors = '\n'.join(relaunch.stderr)
        if relaunch_errors:
            logger.error(f"Restart error: {relaunch_errors}")

    except Exception as monitor_error:
        logger.error(f"Error while monitoring Docker containers: {monitor_error}")
        logger.error(f"Traceback: {traceback.format_exc()}")
        send_alert(f"Error while monitoring Docker containers: {monitor_error}")

def kill_process(pid: int, process_name: str, timeout: int = 10) -> bool:
    """
    Kill a process with escalating force.

    SIGTERM is sent first; if the process is still present after ``timeout``
    seconds, SIGKILL is sent and the result is verified once more.

    Args:
        pid: Process ID to kill
        process_name: Name for logging
        timeout: How long to wait for SIGTERM before using SIGKILL

    Returns:
        bool: True if process was killed (or was already gone), False if it
        couldn't be killed or an unexpected error occurred
    """
    try:
        # First try SIGTERM
        logger.info(f"Sending SIGTERM to {process_name} (PID: {pid})")
        os.kill(pid, signal.SIGTERM)

        # Wait for the process to terminate. A monotonic clock is used so
        # wall-clock adjustments (NTP, manual changes) cannot shorten or
        # stretch the grace period.
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            try:
                # Signal 0 only probes for existence
                os.kill(pid, 0)
                time.sleep(1.5)
            except ProcessLookupError:
                logger.info(f"{process_name} (PID: {pid}) terminated gracefully")
                return True

        # Process didn't terminate, use SIGKILL
        logger.warning(f"{process_name} (PID: {pid}) didn't respond to SIGTERM after {timeout}s, using SIGKILL")
        os.kill(pid, signal.SIGKILL)

        # Verify process is gone
        time.sleep(1.5)
        try:
            os.kill(pid, 0)
            logger.error(f"Failed to kill {process_name} (PID: {pid}) even with SIGKILL")
            return False
        except ProcessLookupError:
            logger.info(f"{process_name} (PID: {pid}) terminated with SIGKILL")
            return True

    except ProcessLookupError:
        # Raised by the initial SIGTERM or by SIGKILL when the process exited
        # in the meantime.
        logger.debug(f"{process_name} (PID: {pid}) was already terminated")
        return True
    except Exception as e:
        logger.error(f"Error killing {process_name} (PID: {pid}): {e}")
        return False

def kill_orphans() -> None:
    """
    Kill orphan processes based on YAML configuration.

    Reads ``config['orphan_processes']['processes']``; each entry is a mapping
    whose ``name`` is either one identifier or a list of identifiers. For
    every identifier the PIDs are looked up by name first and, when nothing
    matches, by full command line; each PID is then passed to kill_process.

    Malformed entries (no usable ``name``, empty or non-string identifiers)
    are skipped with a warning so that one bad entry cannot abort the sweep
    for the remaining ones. Unexpected errors are logged and reported through
    send_alert.
    """
    logger.info("Checking for orphan processes...")
    try:
        if config is None:
            raise RuntimeError("Runtime configuration has not been loaded")

        # A key that is present but empty in YAML loads as None.
        orphan_cfg = config.get('orphan_processes') or {}
        if not isinstance(orphan_cfg, dict):
            return
        processes = orphan_cfg.get('processes', [])
        if not isinstance(processes, list):
            return

        for process_config in processes:
            if not isinstance(process_config, dict):
                continue

            process_identifiers = process_config.get('name')
            if isinstance(process_identifiers, str):
                process_identifiers = [process_identifiers]
            if not isinstance(process_identifiers, (list, tuple)):
                logger.warning(f"Skipping orphan process entry without a valid 'name': {process_config!r}")
                continue

            for identifier in process_identifiers:
                # An empty identifier could match far more than intended.
                if not isinstance(identifier, str) or not identifier.strip():
                    logger.warning(f"Skipping invalid orphan process identifier: {identifier!r}")
                    continue

                # Try exact match first
                pids = get_pids_by_name(identifier)

                # If no matches, try full command line
                if not pids:
                    pids = get_pids_by_name(identifier, full_cmdline=True)

                if not pids:
                    logger.debug(f"No orphan processes found for {identifier}")
                    continue

                logger.info(f"Found {len(pids)} orphan process(es) for {identifier}")

                killed_count = 0
                failed_count = 0
                for pid in pids:
                    if kill_process(pid, f"{identifier} orphan"):
                        killed_count += 1
                    else:
                        failed_count += 1

                if killed_count:
                    logger.info(f"Successfully killed {killed_count} orphan {identifier} process(es)")
                if failed_count:
                    logger.error(f"Failed to kill {failed_count} orphan {identifier} process(es)")
                    send_alert(f"Failed to kill {failed_count} orphan {identifier} process(es)")

    except Exception as e:
        logger.error(f"Error while killing orphans: {e}")
        logger.error(f"Traceback: {traceback.format_exc()}")
        send_alert(f"Error while killing orphans: {e}")

def monitor_services():
    """
    Run the periodic monitoring loop; never returns normally.

    Each pass calls monitor_service for every service whose configuration has
    a truthy 'enabled' value, then sleeps for ``config['monitoring']['interval']``
    seconds (60 when the value is missing or not numeric).

    Raises:
        RuntimeError: if the global configuration is not loaded.
    """
    while True:
        for name, svc_cfg in get_services_section().items():
            if not svc_cfg.get('enabled'):
                continue
            monitor_service(name, svc_cfg)

        # Docker container monitoring and orphan cleanup are not run from
        # this loop (the calls are disabled).

        if config is None:
            raise RuntimeError("Runtime configuration has not been loaded")

        raw_interval = config.get('monitoring', {}).get('interval', 60)
        try:
            pause_seconds = float(raw_interval)
        except (TypeError, ValueError):
            pause_seconds = 60.0

        time.sleep(pause_seconds)

def get_service_status() -> Dict[str, str]:
    """
    Collect the status of every configured service.

    Returns:
    - Dict[str, str]: service name -> status. Besides whatever
      check_docker_status / check_service_status report, the values
      'not-applicable' (container service for the other node type) and
      'disabled' are assigned here. While the daemon is not running, an
      'error' status is reported as 'stopped'.

    Errors are logged and alerted; whatever was collected up to that point is
    returned.
    """
    logger.debug("Getting service status...")
    report: Dict[str, str] = {}
    container_services = ('containers', 'containers-worker')
    try:
        # Is the daemon alive? This changes how 'error' is interpreted below.
        try:
            with open("/tmp/kamiwazad.pid", 'r') as pid_file:
                recorded_pid = int(pid_file.read().strip())
            daemon_up = is_process_running(recorded_pid)
        except Exception:
            # Missing, unreadable or malformed PID file: daemon is not running
            daemon_up = False

        # One Docker check serves both container services
        containers_state = check_docker_status()

        # Head vs worker decides which container service applies
        on_worker = is_worker_node()

        for name, svc_cfg in get_services_section().items():
            belongs_to_other_node = (
                (name == 'containers' and on_worker)
                or (name == 'containers-worker' and not on_worker)
            )
            if belongs_to_other_node:
                report[name] = 'not-applicable'
                continue

            if not is_service_enabled(svc_cfg):
                report[name] = 'disabled'
                continue

            if name in container_services:
                observed = containers_state
            else:
                observed = check_service_status(name, svc_cfg)

            # With the daemon down, an 'error' most likely just means the
            # service is stopped, which is expected rather than a failure.
            if not daemon_up and observed == 'error':
                observed = 'stopped'
            report[name] = observed

    except Exception as status_error:
        logger.error(f"Error getting service status: {status_error}")
        logger.error(f"Traceback: {traceback.format_exc()}")
        send_alert(f"Error getting service status: {status_error}")

    return report

def _get_daemon_status():
    """
    Report whether the daemon recorded in /tmp/kamiwazad.pid is alive.

    Returns:
        tuple: (daemon_running, daemon_info). daemon_info is "(PID: <pid>)"
        when the PID file could be read and checked, otherwise "".
        A missing, unreadable or malformed PID file yields (False, "").
    """
    try:
        with open("/tmp/kamiwazad.pid", 'r') as pid_file:
            pid = int(pid_file.read().strip())
        return is_process_running(pid), f"(PID: {pid})"
    except Exception:
        return False, ""

def _get_status_symbol_and_color(status):
    """
    Map a status string to its display triple.

    Returns:
        tuple: (symbol, ANSI color code, label). Unknown statuses get
        ("?", "37", <status upper-cased>).
    """
    fallback = ("?", "37", status.upper())
    known_statuses = {
        'running': ("✓", "32", "RUNNING"),        # green
        'starting': ("⚡", "33", "STARTING"),      # yellow; wide glyph
        'stopped': ("●", "37", "STOPPED"),        # white/gray: neutral, not an error
        'error': ("✗", "31", "ERROR"),            # red
        'disabled': ("○", "90", "DISABLED"),      # gray
        'not-applicable': ("—", "90", "N/A"),     # gray
    }
    if status in known_statuses:
        return known_statuses[status]
    return fallback

def _group_services_by_status(services_status):
    """
    Order services for display by status priority.

    Returns:
        list: (service, status) tuples, errors first, then starting, stopped,
        running, and finally every other status. Within one group the
        iteration order of ``services_status`` is kept.
    """
    display_order = ('error', 'starting', 'stopped', 'running', 'other')
    buckets = {label: [] for label in display_order}

    for name, state in services_status.items():
        # Statuses without their own bucket are collected under 'other'
        bucket_key = state if state in buckets else 'other'
        buckets[bucket_key].append((name, state))

    ordered = []
    for label in display_order:
        ordered.extend(buckets[label])
    return ordered

def _format_service_name(service_name):
    """
    Format a service name for display.

    Underscores and hyphens are both treated as word separators before
    title-casing, so 'containers-worker' becomes 'Containers Worker' and is
    then mapped to its special display name 'Worker Containers'.

    Returns:
        str: The display name.
    """
    display_name = service_name.replace('_', ' ').replace('-', ' ').title()

    # Handle special cases
    name_mappings = {
        'Containers Worker': 'Worker Containers',
        'Azure Disk Mount': 'Azure Storage',
        'Remote Support': 'Remote Support'
    }

    return name_mappings.get(display_name, display_name)

def _count_services_by_status(services_status):
    """
    Tally services per status.

    Returns:
        tuple: (total, running, starting, stopped, error), where total
        excludes services that are 'disabled' or 'not-applicable'.
    """
    statuses = list(services_status.values())
    excluded_from_total = ('disabled', 'not-applicable')

    total_services = sum(1 for state in statuses if state not in excluded_from_total)
    per_status = tuple(
        statuses.count(label) for label in ('running', 'starting', 'stopped', 'error')
    )
    return (total_services,) + per_status

def _determine_overall_status(daemon_running, total_services, running_count, starting_count, stopped_count, error_count):
    """
    Derive the overall system status.

    Returns:
        tuple: (label, ANSI color code). Checked in this order:
        STOPPED (daemon down with stopped services), DEGRADED (any error),
        STARTING (any service starting), HEALTHY (all counted services
        running), otherwise UNKNOWN.
    """
    # With the daemon down, "STOPPED" takes priority over errors
    if not daemon_running and stopped_count > 0:
        return "STOPPED", "37"  # white/gray
    if error_count > 0:
        return "DEGRADED", "31"  # red
    if starting_count > 0:
        return "STARTING", "33"  # yellow
    if running_count == total_services:
        return "HEALTHY", "32"  # green
    return "UNKNOWN", "37"  # white

def _print_json_status(services_status):
    """
    Print the status as a JSON document on stdout.

    The document has three keys: "daemon" ({"running": bool}), "services"
    (the given mapping) and "timestamp" (current ``time.time()``).
    """
    try:
        with open("/tmp/kamiwazad.pid", 'r') as pid_file:
            recorded_pid = int(pid_file.read().strip())
        daemon_alive = is_process_running(recorded_pid)
    except Exception:
        # Missing, unreadable or malformed PID file: daemon is not running
        daemon_alive = False

    payload = {
        "daemon": {"running": daemon_alive},
        "services": services_status,
        "timestamp": time.time(),
    }
    print(json.dumps(payload, indent=2))

def _print_human_readable_status(services_status):
    """
    Print the status report for interactive use.

    The report has a header, the daemon line, one line per service (ordered
    by _group_services_by_status), a summary footer and, when something
    needs attention, a list of tips. ANSI colors are used only when stdout
    is a terminal.
    """
    ansi_enabled = sys.stdout.isatty()

    def paint(text, color_code):
        return f"\033[{color_code}m{text}\033[0m" if ansi_enabled else text

    # --- Header -----------------------------------------------------------
    border = "═" * 60
    print()
    print(paint(border, "36"))  # cyan
    print(paint("  KAMIWAZA SERVICE STATUS", "36;1"))  # cyan bold
    print(paint(border, "36"))
    print()

    # --- Daemon -----------------------------------------------------------
    daemon_running, daemon_info = _get_daemon_status()
    daemon_key = 'running' if daemon_running else 'stopped'
    daemon_symbol, daemon_color, _ = _get_status_symbol_and_color(daemon_key)
    daemon_label = "RUNNING" if daemon_running else "STOPPED"

    print(f"  {paint(daemon_symbol, daemon_color)} Daemon: {paint(daemon_label, daemon_color)} {daemon_info}")
    print()

    # --- Services ---------------------------------------------------------
    print(paint("  Services:", "37;1"))  # white bold
    print()

    for service, status in _group_services_by_status(services_status):
        symbol, color_code, display_status = _get_status_symbol_and_color(status)
        display_name = _format_service_name(service)

        # The lightning bolt used for 'starting' is a wide glyph, so it is
        # printed without the separating space to keep the columns aligned.
        gap = "" if status == 'starting' else " "
        print(f"    {paint(symbol, color_code)}{gap}{display_name:<20} {paint(display_status, color_code)}")

    print()

    # --- Summary footer ---------------------------------------------------
    total, running, starting, stopped, errors = _count_services_by_status(services_status)
    summary_status, summary_color = _determine_overall_status(
        daemon_running, total, running, starting, stopped, errors
    )

    print(paint("─" * 60, "90"))  # gray rule

    daemon_down = not daemon_running and stopped > 0
    print(f"  Overall Status: {paint(summary_status, summary_color)} ({running}/{total} services running)")
    if daemon_down:
        print(f"  {paint('●', '37')} {stopped} service(s) stopped (daemon not running)")
        if errors > 0:
            print(f"  {paint('✗', '31')} {errors} service(s) with errors (independent of daemon)")
    else:
        if starting > 0:
            print(f"  {paint('⚡', '33')} {starting} service(s) starting up")
        if stopped > 0:
            print(f"  {paint('●', '37')} {stopped} service(s) stopped")
        if errors > 0:
            print(f"  {paint('✗', '31')} {errors} service(s) with errors")
    print()

    # --- Tips -------------------------------------------------------------
    if not (errors > 0 or starting > 0 or daemon_down):
        return

    print(paint("  Tips:", "37;1"))
    if daemon_down:
        print("    • Use 'start' to start the Kamiwaza daemon and services")
        print("    • Use 'doctor' to check system health before starting")
    elif starting > 0:
        print("    • Use 'status -w' to watch startup progress")
    if errors > 0:
        print("    • Use 'status -l' to view recent logs")
        print("    • Use 'doctor' for diagnostic information")
    print("    • Use 'status -j' for machine-readable JSON output")
    print()

def print_kamiwazad_status(json_output=False):
    """
    Print the status of kamiwazad and all managed services.

    Parameters:
    - json_output (bool): When true, print JSON; otherwise print the
      human-readable report.
    """
    snapshot = get_service_status()
    render = _print_json_status if json_output else _print_human_readable_status
    render(snapshot)

def handle_sigterm(signum, frame):
    """
    SIGTERM handler: stop all services, then exit with status 0.
    """
    del signum, frame  # required by the handler signature, not used
    logger.info("Received SIGTERM signal. Shutting down services...")
    stop_all_services(is_shutdown=True)
    sys.exit(0)

def handle_sighup(signum, frame):
    """
    Handle SIGHUP signal for configuration reload.

    Re-reads ``startup/kamiwaza.yml`` under KAMIWAZA_ROOT, expands
    environment variables and replaces the global ``config``. The current
    configuration is kept when the file cannot be read or parsed, or when it
    does not contain a mapping (for example an empty file, which
    ``yaml.safe_load`` returns as None). Failures are logged and reported
    through send_alert; they never propagate out of the handler.
    """
    global config
    _ = signum, frame  # Suppress unused variable warnings
    logger.info("Received SIGHUP signal. Reloading configuration...")
    try:
        config_path = os.path.join(KAMIWAZA_ROOT, 'startup/kamiwaza.yml')
        with open(config_path, 'r', encoding='utf-8') as f:
            new_config = yaml.safe_load(f)

        # Never swap in an unusable configuration: the rest of the daemon
        # treats a None config as "not loaded" and raises.
        if not isinstance(new_config, dict):
            raise ValueError(
                f"Configuration file {config_path} does not contain a mapping "
                f"(got {type(new_config).__name__}); keeping current configuration"
            )

        # Expand environment variables in the new config
        new_config = expand_env_vars(new_config)

        # Update the global config
        config = new_config

        logger.info("Configuration reloaded successfully")

        # Services are not restarted here; configuration changes take effect
        # the next time the configuration is read.

    except Exception as e:
        logger.error(f"Error reloading configuration: {e}")
        logger.error(f"Traceback: {traceback.format_exc()}")
        send_alert(f"Error reloading configuration: {e}")

def start_kamiwazad():
    """
    Start the kamiwazad service and block forever.

    Steps, in order: change into ``config['base_directory']``, install the
    SIGTERM handler, start the services, run the orphan killer once, launch
    the monitoring loop in a daemon thread, then idle in the main thread.
    """
    logger.info("Starting kamiwazad...")

    os.chdir(config['base_directory'])

    # Graceful shutdown on SIGTERM
    signal.signal(signal.SIGTERM, handle_sigterm)

    start_services()

    # One orphan sweep at startup
    kill_orphans()

    # Monitoring runs in the background; daemon=True so it never blocks exit
    threading.Thread(target=monitor_services, daemon=True).start()

    # Park the main thread so the process stays alive
    while True:
        time.sleep(1)

def stop_kamiwazad():
    """
    Stop all managed services and shut down telemetry.

    The configuration is re-read from ``startup/kamiwaza.yml`` and stored in
    the global ``config`` before stop_all_services is called. Progress and
    outcome are printed for the user; errors are logged and printed, not
    raised. OTEL and metrics are shut down afterwards in every case (when
    available).
    """
    global config

    logger.debug("Stopping kamiwazad and all managed services...")
    print("Attempting to stop services...")
    try:
        yaml_path = os.path.join(KAMIWAZA_ROOT, 'startup/kamiwaza.yml')
        with open(yaml_path, 'r', encoding='utf-8') as stream:
            loaded = yaml.safe_load(stream)
        config = expand_env_vars(loaded)

        if stop_all_services(is_shutdown=True):
            print("\nAll services stopped successfully")
        else:
            print("\nWarning: Some services may not have stopped cleanly")

    except Exception as shutdown_error:
        logger.error(f"Error during shutdown: {shutdown_error}")
        logger.error(f"Traceback: {traceback.format_exc()}")
        print(f"Error during shutdown: {shutdown_error}")
    finally:
        # Shut telemetry down before exit to prevent connection errors
        if otel_available:
            try:
                otel_config.shutdown()
            except Exception as otel_error:
                logger.debug(f"Error shutting down OTEL: {otel_error}")

        if metrics_available:
            try:
                metrics_config.shutdown()
            except Exception as metrics_error:
                logger.debug(f"Error shutting down metrics: {metrics_error}")

        logger.debug("Stop operation completed")
        print("Stop operation completed")

def start_missing_services() -> None:
    """
    Start only the enabled services that are currently stopped or in error.

    Services are processed in ``config['startup_order']``. Before a service
    is started, each of its dependencies that is itself stopped or in error
    is started first. If a dependency has no configuration or fails to
    start, the dependent service is skipped. A service that was already
    started as a dependency during this pass is not started a second time.

    Progress is printed for interactive users and logged. Errors are logged,
    never raised.
    """
    logger.info("Checking for missing services...")
    try:
        if config is None:
            raise RuntimeError("Runtime configuration has not been loaded")

        current_status = get_service_status()
        services_section = get_services_section()
        startup_order_raw = config.get('startup_order', [])
        if not isinstance(startup_order_raw, list):
            raise TypeError("Configuration startup_order must be a list")

        restartable = ('stopped', 'error')

        # Enabled services with a configuration, in startup order
        candidates = []
        for service_name in startup_order_raw:
            service_config = services_section.get(service_name)
            if service_config is None or not is_service_enabled(service_config):
                continue
            candidates.append((service_name, service_config))

        missing_services: List[str] = [
            name for name, _ in candidates if current_status.get(name) in restartable
        ]

        # Print a concise summary for interactive users
        if missing_services:
            print(f"Starting missing service(s): {', '.join(missing_services)}")
        else:
            print("No missing services to start")

        # current_status is a snapshot taken before anything was started, so
        # successful starts are tracked here to avoid starting them twice.
        started_now = set()

        for service_name, service_config in candidates:
            status = current_status.get(service_name)
            if status not in restartable:
                logger.debug(f"Skipping {service_name} (status: {status})")
                continue
            if service_name in started_now:
                logger.debug(f"Skipping {service_name} (already started as a dependency)")
                continue

            logger.info(f"Attempting to start {service_name} (current status: {status})")
            print(f"→ {service_name}: starting (was {status})")

            # Check dependencies first. A missing/empty key means none; a
            # single name is accepted as well as a list.
            dependencies = service_config.get('dependencies') or []
            if isinstance(dependencies, str):
                dependencies = [dependencies]

            dependencies_ready = True
            for dep in dependencies:
                if dep in started_now:
                    continue
                dep_status = current_status.get(dep)
                if dep_status not in restartable:
                    continue

                logger.info(f"Starting dependency {dep} first")
                print(f"   ↳ starting dependency: {dep} (was {dep_status})")
                dependency_config = services_section.get(dep)
                if dependency_config is None:
                    logger.error(f"Missing configuration for dependency {dep}; skipping {service_name}")
                    print(f"   ✗ missing dependency config: {dep}; skipping {service_name}")
                    dependencies_ready = False
                    break
                if not start_service(dependency_config, dep):
                    logger.error(f"Failed to start dependency {dep}, skipping {service_name}")
                    print(f"   ✗ failed to start dependency: {dep}; skipping {service_name}")
                    dependencies_ready = False
                    break
                started_now.add(dep)

            if not dependencies_ready:
                # The service really is skipped, as the messages above state.
                continue

            if start_service(service_config, service_name):
                started_now.add(service_name)
                logger.info(f"Successfully started {service_name}")
                print(f"✓ {service_name}: started")
            else:
                logger.error(f"Failed to start {service_name}")
                print(f"✗ {service_name}: failed to start")

    except Exception as e:
        logger.error(f"Error starting missing services: {e}")
        logger.error(f"Traceback: {traceback.format_exc()}")


def _parse_cli_args() -> argparse.Namespace:
    """
    Parse the kamiwazad command line (taken from ``sys.argv``).

    Returns:
        argparse.Namespace: ``command`` ('start', 'stop' or 'status') plus
        the boolean flags ``all``, ``missing`` and ``json``.
    """
    cli = argparse.ArgumentParser(description="Kamiwazad service manager")
    cli.add_argument('command', choices=['start', 'stop', 'status'], help="Command to execute")

    boolean_flags = (
        (('-a', '--all'), "Check all service statuses even if daemon is down"),
        (('--missing',), "Start only missing/stopped services"),
        (('--json',), "Output status in JSON format"),
    )
    for option_names, description in boolean_flags:
        cli.add_argument(*option_names, action='store_true', help=description)

    return cli.parse_args()


def _require_config() -> Dict[str, Any]:
    """
    Return the loaded global configuration.

    Raises:
        RuntimeError: if the configuration has not been loaded.
    """
    if config is not None:
        return config
    raise RuntimeError(
        "Configuration not loaded - KAMIWAZA_ROOT may not be set or kamiwaza.yml may not be accessible"
    )


def initialize_main_logging_and_metrics() -> Tuple[Optional[Any], Optional[Any]]:
    """
    Configure logging, tracing, metrics and exception handlers for main().

    The global ``logger`` is replaced by the one setup_logging returns; if
    that fails, the error is printed to stderr and the process exits with
    status 1. OTEL, metrics and the top-level exception handlers are each
    initialized only when available, and a failure in any of them is logged
    as a warning without aborting.

    Returns:
        Tuple[Optional[Any], Optional[Any]]: (tracer, metrics helper); an
        element is None when its subsystem is unavailable or failed to
        initialize.
    """
    global logger
    try:
        logger = setup_logging(
            service_name="kamiwazad",
            component="main",
            enable_syslog=should_enable_syslog(),
            force_reconfigure=True,
        )
        logger.debug("Main function configured with modern structured logging")
    except Exception as setup_error:
        # Without logging there is nothing sensible to continue with.
        print("CRITICAL: Main function logging configuration failed.", file=sys.stderr)
        print(f"Error: {setup_error}", file=sys.stderr)
        print("Traceback:", file=sys.stderr)
        traceback.print_exc(file=sys.stderr)
        print("\nCannot continue without logging. Check configuration and environment.", file=sys.stderr)
        sys.exit(1)

    active_tracer: Optional[Any] = None
    if otel_available:
        try:
            otel_config.initialize()
            active_tracer = otel_config.tracer
        except Exception as otel_error:
            logger.warning(f"Failed to initialize OTEL in kamiwazad: {otel_error}")

    active_metrics: Optional[Any] = None
    if metrics_available:
        try:
            metrics_config.initialize()
            active_metrics = metrics_helper
        except Exception as metrics_error:
            logger.warning(f"Failed to initialize metrics in kamiwazad: {metrics_error}")

    if exception_handler_available:
        try:
            install_exception_handlers(service_name="kamiwazad", component="main")
            handler_details = {
                "event_type": "service_start",
                "service": "kamiwazad",
                "exception_handlers": True,
                "daemon_phase": "initialization",
            }
            logger.debug("Top-level exception handlers installed", extra=handler_details)
        except Exception as handler_error:
            logger.warning(f"Failed to install exception handlers: {handler_error}")

    if not modern_logging_available:
        logger.error("Modern logging unavailable")
    else:
        from kamiwaza.lib.logging.utils import get_log_directory

        logging_details = {
            "event_type": "service_start",
            "service": "kamiwazad",
            "log_directory": get_log_directory(),
            "daemon_phase": "initialization",
        }
        logger.debug("Modern logging configured", extra=logging_details)

    return active_tracer, active_metrics



def handle_start_command(
    args: argparse.Namespace,
    tracer: Optional[Any],
    metrics_helper_local: Optional[Any],
) -> None:
    """
    Run the 'start' command: bring services up and keep the daemon alive.

    Parameters:
    - args: Parsed CLI arguments; ``args.missing`` selects "start only the
      missing services".
    - tracer: Tracer used to record a "daemon.start" span, or None.
    - metrics_helper_local: Metrics helper used for start/stop counters and
      the startup-duration histogram, or None.

    Behavior:
    - With ``--missing`` and a daemon already running (per
      /tmp/kamiwazad.pid), the missing services are started once and the
      function returns.
    - Otherwise services are started, the monitoring thread and the
      SIGTERM/SIGHUP handlers are installed, and the function blocks in a
      keep-alive loop until interrupted.
    - A keyboard interrupt or any unexpected exception ends with
      stop_kamiwazad().

    Raises:
        RuntimeError: from _require_config() when no configuration is loaded.
    """
    _require_config()

    span = None
    # Always time the startup: the duration goes into the audit log and the
    # metrics histogram, which do not depend on OTEL being available.
    start_time = datetime.now(timezone.utc)

    if tracer:
        span = tracer.start_span("daemon.start")
        span.set_attribute("kamiwaza.operation", "daemon_start")
        span.set_attribute("kamiwaza.component", "daemon")
        span.set_attribute("daemon.command", "start")
        span.set_attribute("daemon.missing_only", args.missing)
        span.set_attribute("daemon.pid", os.getpid())

    logger.info(
        "Daemon start initiated",
        extra={
            "event_type": "service_start",
            "service": "kamiwazad",
            "pid": os.getpid(),
            "missing_only": args.missing,
            "daemon_phase": "start",
        },
    )

    if secretary_logger_available:
        get_logger().log_event(EventName.APP_START, send_to_telemetry=True)

    try:
        if args.missing:
            pid_file = "/tmp/kamiwazad.pid"
            if os.path.exists(pid_file):
                try:
                    with open(pid_file, 'r') as file:
                        existing_pid = int(file.read().strip())
                    if is_process_running(existing_pid):
                        logger.info("Daemon already running; starting missing services (one-shot) and exiting")
                        start_missing_services()
                        # Close the span on this early exit so it is not
                        # left open forever.
                        if span:
                            span.set_status(Status(StatusCode.OK))
                            span.end()
                            span = None
                        return
                except Exception:
                    logger.debug("Unable to verify running daemon PID; continuing with normal start flow")

        if args.missing:
            start_missing_services()
        else:
            start_services()

        threading.Thread(target=monitor_services, daemon=True).start()
        signal.signal(signal.SIGTERM, handle_sigterm)
        signal.signal(signal.SIGHUP, handle_sighup)

        duration_ms = (datetime.now(timezone.utc) - start_time).total_seconds() * 1000
        logger.info(
            "Daemon started successfully",
            extra={
                "event_category": "audit",
                "is_audit_event": True,
                "event_type": "service_start",
                "service": "kamiwazad",
                "resource": "daemon/kamiwazad",
                "action": "start",
                "result": "success",
                "pid": os.getpid(),
                "missing_only": args.missing,
                "duration_ms": duration_ms,
                "daemon_phase": "running",
            },
        )

        if metrics_helper_local:
            try:
                metrics_helper_local.record_counter("kamiwaza.daemon.starts.total", 1, {"status": "success"})
                metrics_helper_local.record_histogram(
                    "kamiwaza.daemon.startup.duration.seconds",
                    duration_ms / 1000.0,
                    {},
                )
            except Exception as exc:
                logger.warning(f"Failed to record daemon start metrics: {exc}")

        if span:
            span.set_attribute("daemon.startup_duration_ms", duration_ms)
            span.set_attribute("daemon.services_started", len(service_states))
            span.set_status(Status(StatusCode.OK))
            span.end()
            # The span is finished; make sure the failure handler below
            # cannot record on it or end it a second time.
            span = None

        try:
            # Keep-alive loop for the main thread.
            sleep_time = 0
            while True:
                time.sleep(1)
                sleep_time += 1
                # Ten seconds in, when KAMIWAZAD_IS_WORKER is '0', remove the
                # /tmp/kamiwazad.starting marker file.
                if sleep_time == 10 and os.environ.get('KAMIWAZAD_IS_WORKER', '999') == '0':
                    try:
                        os.remove('/tmp/kamiwazad.starting')
                    except FileNotFoundError:
                        pass
        except KeyboardInterrupt:
            logger.info(
                "Received keyboard interrupt",
                extra={
                    "event_category": "audit",
                    "is_audit_event": True,
                    "event_type": "service_stop",
                    "service": "kamiwazad",
                    "resource": "daemon/kamiwazad",
                    "action": "stop",
                    "result": "success",
                    "reason": "keyboard_interrupt",
                    "daemon_phase": "shutdown",
                },
            )

            if metrics_helper_local:
                try:
                    metrics_helper_local.record_counter(
                        "kamiwaza.daemon.stops.total",
                        1,
                        {"reason": "interrupt"},
                    )
                except Exception:
                    # Metrics are best-effort during shutdown
                    pass

            stop_kamiwazad()

    except Exception as exc:  # pylint: disable=broad-except
        logger.error(
            "Daemon startup failed",
            extra={
                "event_category": "audit",
                "is_audit_event": True,
                "event_type": "service_initialization_failed",
                "service": "kamiwazad",
                "resource": "daemon/kamiwazad",
                "action": "start",
                "result": "error",
                "error_type": type(exc).__name__,
                "error_message": str(exc),
                "daemon_phase": "error",
            },
            exc_info=True,
        )

        if metrics_helper_local:
            try:
                metrics_helper_local.record_counter(
                    "kamiwaza.daemon.starts.total",
                    1,
                    {"status": "error"},
                )
            except Exception:
                # Metrics are best-effort on the failure path
                pass

        if span:
            span.record_exception(exc)
            span.set_status(Status(StatusCode.ERROR, str(exc)))
            span.set_attribute("error", True)
            span.set_attribute("error.type", type(exc).__name__)
            span.end()

        stop_kamiwazad()


def handle_stop_command(args: argparse.Namespace, tracer: Optional[Any], metrics_helper_local: Optional[Any]) -> None:
    """Handle the ``stop`` CLI command: shut the daemon down and record the outcome.

    Emits a debug log, an optional telemetry shutdown event, an audit log
    entry, a ``kamiwaza.daemon.stops.total`` counter and (when a tracer is
    supplied) a ``daemon.stop`` span describing whether the stop succeeded.

    Args:
        args: Parsed command-line arguments. Not read by this handler; kept
            so every command handler shares the same calling convention.
        tracer: Object exposing ``start_span``; when falsy no span is created.
        metrics_helper_local: Object exposing ``record_counter``; when falsy
            no counters are recorded.

    Raises:
        Exception: Whatever ``stop_kamiwazad()`` (or the success-path
            bookkeeping) raised is logged and then re-raised unchanged.
    """
    _require_config()

    span = None
    # Duration is only measured when otel_available is truthy; otherwise the
    # reported duration below falls back to 0.
    stop_start_time = datetime.now(timezone.utc) if otel_available else None

    if tracer:
        span = tracer.start_span("daemon.stop")
        span.set_attribute("kamiwaza.operation", "daemon_stop")
        span.set_attribute("kamiwaza.component", "daemon")
        span.set_attribute("daemon.command", "stop")

    logger.debug(
        "Daemon stop initiated",
        extra={"event_type": "service_stop", "service": "kamiwazad", "daemon_phase": "shutdown"},
    )

    if secretary_logger_available:
        get_logger().log_event(EventName.APP_SHUTDOWN, send_to_telemetry=True)
        kamiwaza_logger_instance.close()

    try:
        stop_kamiwazad()

        stop_duration_ms = (datetime.now(timezone.utc) - stop_start_time).total_seconds() * 1000 if stop_start_time else 0
        logger.debug(
            "Daemon stopped successfully",
            extra={
                "event_category": "audit",
                "is_audit_event": True,
                "event_type": "service_stop",
                "service": "kamiwazad",
                "resource": "daemon/kamiwazad",
                "action": "stop",
                "result": "success",
                "reason": "graceful",
                "duration_ms": stop_duration_ms,
                "daemon_phase": "stopped",
            },
        )

        if metrics_helper_local:
            try:
                metrics_helper_local.record_counter(
                    "kamiwaza.daemon.stops.total",
                    1,
                    {"reason": "graceful"},
                )
            except Exception:
                # Metrics are best-effort: a failing counter must not turn a
                # successful stop into a reported failure.
                pass

        if span:
            span.set_attribute("daemon.stop_duration_ms", stop_duration_ms)
            span.set_status(Status(StatusCode.OK))
            span.end()

    except Exception as exc:  # pylint: disable=broad-except
        logger.error(
            "Daemon stop failed",
            extra={
                "event_category": "audit",
                "is_audit_event": True,
                "event_type": "service_stop",
                "service": "kamiwazad",
                "resource": "daemon/kamiwazad",
                "action": "stop",
                "result": "error",
                "error_type": type(exc).__name__,
                "error_message": str(exc),
                "daemon_phase": "error",
            },
            exc_info=True,
        )

        if metrics_helper_local:
            try:
                metrics_helper_local.record_counter(
                    "kamiwaza.daemon.stops.total",
                    1,
                    {"reason": "error"},
                )
            except Exception:
                # Best-effort, as above; the original failure is re-raised below.
                pass

        if span:
            # Close the span on the failure path too, so a failed stop is
            # visible in traces instead of leaving the span open forever.
            try:
                span.record_exception(exc)
                span.set_status(Status(StatusCode.ERROR, str(exc)))
                span.set_attribute("error", True)
                span.set_attribute("error.type", type(exc).__name__)
                span.end()
            except Exception:  # pylint: disable=broad-except
                # A tracing problem must never mask the real stop failure.
                logger.debug("Failed to finalize daemon.stop span", exc_info=True)

        raise


def handle_status_command(args: argparse.Namespace, *, pid_file: str = "/tmp/kamiwazad.pid") -> None:
    """Handle the ``status`` CLI command.

    With ``args.all`` set, the full status report is always printed.
    Otherwise the report is printed only when the PID recorded in
    ``pid_file`` belongs to a running process; in every other case a
    "not running" message is printed (as JSON when ``args.json`` is set).

    A missing, empty or corrupt PID file is treated as "daemon not running"
    rather than crashing the status command.

    Args:
        args: Parsed command-line arguments; ``args.all`` and ``args.json``
            are read.
        pid_file: Path of the daemon PID file. Defaults to the location this
            handler has always used.
    """
    if args.all:
        print_kamiwazad_status(json_output=args.json)
        return

    # EAFP: open directly instead of exists()+open(), which races with the
    # daemon removing its PID file between the two calls.
    try:
        with open(pid_file, 'r') as file:
            pid: Optional[int] = int(file.read().strip())
    except (FileNotFoundError, ValueError):
        # No PID file, or its content is not an integer (empty/garbled).
        pid = None

    # Non-positive values are never a valid daemon PID, so they are not
    # handed to the liveness check at all.
    if pid is not None and pid > 0 and is_process_running(pid):
        print_kamiwazad_status(json_output=args.json)
        return

    if args.json:
        print(json.dumps({"daemon": {"running": False}, "services": {}, "timestamp": time.time()}))
    else:
        print("kamiwazad is not running")


def main() -> None:
    """Parse the command line, initialise logging/metrics and run the command.

    ``start`` and ``stop`` are routed to their dedicated handlers together
    with the tracer and metrics helper; any other command value is handled
    by the status handler.
    """
    cli_args = _parse_cli_args()
    _require_config()
    otel_tracer, metrics = initialize_main_logging_and_metrics()

    # Structured context attached to the "command received" debug record.
    log_context = {
        "event_type": "service_start",
        "service": "kamiwazad",
        "command": cli_args.command,
        "daemon_phase": "initialization",
    }
    logger.debug("Daemon command received", extra=log_context)

    command = cli_args.command
    if command == 'start':
        handle_start_command(cli_args, otel_tracer, metrics)
        return
    if command == 'stop':
        handle_stop_command(cli_args, otel_tracer, metrics)
        return

    # Everything that is neither start nor stop is treated as a status query.
    handle_status_command(cli_args)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
