#!/usr/bin/env bash
set -euo pipefail

# Cluster maintenance helper for Patroni + etcd + HAProxy deployments.
#
# Safe patterns:
# - PostgreSQL replicas: add new replica, let it sync, then remove old replica.
# - etcd members: when replacing a permanently removed/unhealthy member in a 3-node cluster,
#   remove old member first, then add the new member.
#
# Examples:
#   ./cluster-node.sh add-replica db4 10.20.0.14 root@db4.example.com
#   ./cluster-node.sh remove-replica db4 root@db4.example.com
#   ./cluster-node.sh switchover db2
#   ./cluster-node.sh replace-etcd-member db3 db4 10.20.0.14
#   ./cluster-node.sh list
#   ./cluster-node.sh etcd-members

# Runtime configuration. Each setting may be overridden from the environment;
# the text after ':-' is used when the variable is unset or empty.
# Patroni cluster scope, passed to 'patronictl switchover'.
CLUSTER_NAME="${CLUSTER_NAME:-pg-ha-demo}"
# Admin node name; in this section it only feeds the default container names.
ADMIN_NODE="${ADMIN_NODE:-db1}"
# SSH target that run_admin connects to.
ADMIN_NODE_SSH="${ADMIN_NODE_SSH:-root@db1.example.com}"
# Extra ssh options. run_admin/run_remote expand this unquoted, so the value is
# split on whitespace into separate ssh arguments.
ADMIN_SSH_OPTS="${ADMIN_SSH_OPTS:--o BatchMode=yes}"
# Config file handed to 'patronictl -c' inside the Patroni container.
PATRONI_CONFIG_PATH="${PATRONI_CONFIG_PATH:-/etc/patroni/patroni.yml}"
# Endpoint list passed verbatim to 'etcdctl --endpoints='.
ETCD_ENDPOINTS="${ETCD_ENDPOINTS:-http://db1:2379,http://db2:2379,http://db3:2379}"
# Directory that add_replica changes into on the new host before running
# 'docker compose'.
# NOTE(review): the '~' sits inside double quotes, so it presumably stays
# literal here and is expanded by the remote shell instead — confirm.
DB_STACK_DIR="${DB_STACK_DIR:-~/db-stack}"
# Containers used for 'docker exec' on the admin node; the defaults are derived
# from ADMIN_NODE, so they must be assigned after it.
ETCD_CONTAINER_NAME="${ETCD_CONTAINER_NAME:-${ADMIN_NODE}-etcd}"
PATRONI_CONTAINER_NAME="${PATRONI_CONTAINER_NAME:-${ADMIN_NODE}-patroni}"

#######################################
# Print command-line help to stdout.
# Lists every subcommand and every environment override the script reads,
# including the two container-name overrides.
# Arguments: none ($0 is interpolated as the program name)
# Outputs:   help text on stdout
#######################################
usage() {
  cat <<USAGE
Usage:
  $0 add-replica <new_name> <new_wg_ip> <new_host_ssh>
  $0 remove-replica <old_name> <old_host_ssh>
  $0 switchover <candidate_name>
  $0 replace-etcd-member <old_name> <new_name> <new_wg_ip>
  $0 list
  $0 etcd-members

Environment overrides:
  CLUSTER_NAME          Patroni cluster scope (default: pg-ha-demo)
  ADMIN_NODE            Existing admin node name (default: db1)
  ADMIN_NODE_SSH        SSH target for admin node (default: root@db1.example.com)
  ADMIN_SSH_OPTS        SSH options (default: -o BatchMode=yes)
  PATRONI_CONFIG_PATH   Patroni config path in container (default: /etc/patroni/patroni.yml)
  ETCD_ENDPOINTS        Comma-separated etcd endpoints
  DB_STACK_DIR          Stack dir on remote DB hosts (default: ~/db-stack)
  ETCD_CONTAINER_NAME   etcd container on admin node (default: <ADMIN_NODE>-etcd)
  PATRONI_CONTAINER_NAME
                        Patroni container on admin node (default: <ADMIN_NODE>-patroni)
USAGE
}

#######################################
# Run a command on the admin node over SSH.
# Globals:   ADMIN_SSH_OPTS (read), ADMIN_NODE_SSH (read)
# Arguments: "$@" - words passed to ssh after the target
# Returns:   exit status of ssh
#######################################
run_admin() {
  # Put the SSH target in front of the caller's words so a single "$@" carries
  # both.
  set -- "${ADMIN_NODE_SSH}" "$@"
  # shellcheck disable=SC2086 # the option string is split into words on purpose
  ssh ${ADMIN_SSH_OPTS} "$@"
}

#######################################
# Run a command on an arbitrary host over SSH.
# Globals:   ADMIN_SSH_OPTS (read)
# Arguments: $1 - SSH target; remaining words are passed to ssh after it
# Returns:   exit status of ssh
#######################################
run_remote() {
  # "${@:2}" is every argument after the host, each kept as its own word.
  # shellcheck disable=SC2086 # the option string is split into words on purpose
  ssh ${ADMIN_SSH_OPTS} "$1" "${@:2}"
}

#######################################
# Print the Patroni member listing by running 'patronictl list' inside the
# Patroni container on the admin node.
# Globals:   PATRONI_CONTAINER_NAME (read), PATRONI_CONFIG_PATH (read)
# Outputs:   whatever the remote command prints
# Returns:   exit status of run_admin
#######################################
patroni_list() {
  local remote_cmd
  remote_cmd="docker exec ${PATRONI_CONTAINER_NAME} patronictl -c ${PATRONI_CONFIG_PATH} list"
  run_admin "${remote_cmd}"
}

#######################################
# Print the etcd member list by running 'etcdctl member list' inside the etcd
# container on the admin node.
# Globals:   ETCD_CONTAINER_NAME (read), ETCD_ENDPOINTS (read)
# Outputs:   whatever the remote command prints
# Returns:   exit status of run_admin
#######################################
etcd_members() {
  # The etcdctl call is wrapped in single quotes so the container's 'sh -lc'
  # receives it as one script argument.
  local etcdctl_cmd="ETCDCTL_API=3 etcdctl --endpoints=${ETCD_ENDPOINTS} member list"
  run_admin "docker exec ${ETCD_CONTAINER_NAME} sh -lc '${etcdctl_cmd}'"
}

#######################################
# Look up the etcd member ID for a member name.
# Arguments: $1 - etcd member name
# Outputs:   the ID (first column) of every row whose name column equals $1;
#            nothing when no row matches
# Returns:   non-zero if etcd_members fails, otherwise awk's exit status
#######################################
get_etcd_member_id_by_name() {
  local name="$1"
  local members

  members="$(etcd_members)" || return

  # 'etcdctl member list' separates columns with ", " (comma + space). Splitting
  # on a bare comma leaves a leading space on the name and it never matches, so
  # the separator also consumes the spaces after the comma.
  awk -F', *' -v n="${name}" '$3 == n { print $1 }' <<<"${members}"
}

#######################################
# Poll the Patroni listing until a member with the given name shows up.
# Arguments: $1 - member name
#            $2 - maximum number of polls (default 60)
#            $3 - seconds to sleep between polls (default 5)
# Outputs:   progress on stdout, timeout message on stderr
# Returns:   0 once the member is listed, 1 on timeout
#######################################
wait_for_patroni_member() {
  local member="$1"
  local tries="${2:-60}"
  local delay="${3:-5}"
  local attempt listing

  echo "Waiting for Patroni member ${member} to appear..."
  for ((attempt = 1; attempt <= tries; attempt++)); do
    # 'patronictl list' prints a bordered table by default, so rows start with
    # '|' and the name is a padded cell. Capture the listing first (no early
    # pipe close under pipefail), then compare the Member cell as an exact
    # string, so names are never treated as regexes and db1 never matches db10.
    if listing="$(patroni_list)" &&
      awk -F'|' -v want="${member}" '
        function trim(s) { gsub(/^[ \t]+|[ \t]+$/, "", s); return s }
        BEGIN { col = 2 }              # first cell after the leading "|"
        /^[ \t]*[+]/ { next }          # border / title lines
        {
          # Header row: remember which column holds the member name.
          for (i = 1; i <= NF; i++) {
            if (trim($i) == "Member") { col = i; next }
          }
        }
        trim($col) == want { hit = 1 }
        END { exit !hit }
      ' <<<"${listing}"; then
      echo "Member ${member} is visible in Patroni."
      return 0
    fi
    sleep "${delay}"
  done

  echo "Timed out waiting for Patroni member ${member}." >&2
  return 1
}

#######################################
# Poll the Patroni listing until the given member's row reports a replica role.
# A row qualifies when its Member cell equals the name and the row text
# contains "Replica" or "Standby".
# Arguments: $1 - member name
#            $2 - maximum number of polls (default 90)
#            $3 - seconds to sleep between polls (default 10)
# Outputs:   progress on stdout, timeout message on stderr
# Returns:   0 once the member is listed as a replica, 1 on timeout
#######################################
wait_for_replica_state() {
  local member="$1"
  local tries="${2:-90}"
  local delay="${3:-10}"
  local attempt listing

  echo "Waiting for ${member} to be a running replica..."
  for ((attempt = 1; attempt <= tries; attempt++)); do
    # 'patronictl list' prints a bordered table by default, so rows start with
    # '|'. Capture the listing first (no early pipe close under pipefail) and
    # compare the Member cell as an exact string rather than a regex.
    if listing="$(patroni_list)" &&
      awk -F'|' -v want="${member}" '
        function trim(s) { gsub(/^[ \t]+|[ \t]+$/, "", s); return s }
        BEGIN { col = 2 }              # first cell after the leading "|"
        /^[ \t]*[+]/ { next }          # border / title lines
        {
          # Header row: remember which column holds the member name.
          for (i = 1; i <= NF; i++) {
            if (trim($i) == "Member") { col = i; next }
          }
        }
        trim($col) == want && /Replica|Standby/ { hit = 1 }
        END { exit !hit }
      ' <<<"${listing}"; then
      echo "Replica ${member} is present."
      return 0
    fi
    sleep "${delay}"
  done

  echo "Timed out waiting for ${member} to become a replica." >&2
  return 1
}

#######################################
# Bring up a new replica on a prepared host and wait for it to join.
# Prints the prerequisites, starts the compose stack on the new host, waits for
# the member to appear and to report a replica role, then prints the manual
# follow-up steps.
# Globals:   DB_STACK_DIR (read)
# Arguments: $1 - new member name
#            $2 - new member WireGuard IP (only shown in the messages)
#            $3 - SSH target of the new host
#######################################
add_replica() {
  local new_name="$1" new_ip="$2" new_host_ssh="$3"

  printf '%s\n' \
    "Before continuing, make sure on ${new_host_ssh}:" \
    "- WireGuard is already configured and connected with ${new_ip}" \
    "- /etc/hosts contains db1, db2, db3 and the new host if needed" \
    "- ${DB_STACK_DIR} contains the generated files for the new node" \
    "- The new node's .env has NODE_NAME=${new_name} and WG_IP=${new_ip}"

  printf '%s\n' "Bringing up the new replica on ${new_host_ssh}..."
  run_remote "${new_host_ssh}" "cd ${DB_STACK_DIR} && docker compose up -d --build"

  # Separate statements (not an && chain) so 'set -e' aborts on a timeout.
  wait_for_patroni_member "${new_name}"
  wait_for_replica_state "${new_name}"

  printf '%s\n' \
    "Replica ${new_name} has joined." \
    "" \
    "Next manual steps:" \
    "1. Add this line to HAProxy backend and reload HAProxy:" \
    "   server ${new_name} ${new_name}:5432 check port 8008" \
    "2. Confirm the new node is caught up before retiring any old node."
}

#######################################
# Stop the Patroni container of a replica that is being retired.
# Prints the current topology, refuses to continue when the listing shows the
# target as the leader, stops '<old_name>-patroni' on the target host, then
# prints the manual follow-up steps.
# Arguments: $1 - member name to retire
#            $2 - SSH target of that member's host
# Outputs:   topology and instructions on stdout, refusal message on stderr
# Returns:   1 when the target is the current leader
#######################################
remove_replica() {
  local old_name="$1"
  local old_host_ssh="$2"
  local topology

  echo "Current cluster topology:"
  topology="$(patroni_list)"
  printf '%s\n' "${topology}"

  # Stopping the leader would force an unplanned failover, so bail out when the
  # target's row carries a Leader role. Rows of the default 'patronictl list'
  # table start with '|'; the Member cell is compared as an exact string.
  if awk -F'|' -v want="${old_name}" '
      function trim(s) { gsub(/^[ \t]+|[ \t]+$/, "", s); return s }
      BEGIN { col = 2 }              # first cell after the leading "|"
      /^[ \t]*[+]/ { next }          # border / title lines
      {
        # Header row: remember which column holds the member name.
        for (i = 1; i <= NF; i++) {
          if (trim($i) == "Member") { col = i; next }
        }
      }
      trim($col) == want && /Leader/ { hit = 1 }
      END { exit !hit }
    ' <<<"${topology}"; then
    echo "Refusing to stop ${old_name}: it is the current leader. Switch over to another member first." >&2
    return 1
  fi

  echo "Stopping Patroni/PostgreSQL on ${old_name}..."
  # '|| true' runs on the remote side: a container that is already stopped or
  # missing is not treated as an error.
  run_remote "${old_host_ssh}" "docker stop ${old_name}-patroni || true"

  cat <<INFO
Replica ${old_name} stopped.

Next manual steps:
1. Remove ${old_name} from HAProxy backend and reload HAProxy.
2. If the server is being retired permanently and also runs etcd, handle etcd membership separately.
3. After validation, destroy the old VPS or wipe its data directory.
INFO
}

#######################################
# Switch the Patroni leader role to the given candidate.
# Prints the topology, reads the current leader's name from it, runs
# 'patronictl switchover ... --force' on the admin node, then prints the
# topology again.
# Globals:   PATRONI_CONTAINER_NAME, PATRONI_CONFIG_PATH, CLUSTER_NAME (read)
# Arguments: $1 - member name that should become leader
# Outputs:   topology and progress on stdout, errors on stderr
# Exits:     1 when no leader can be found in the listing
#######################################
switchover_primary() {
  local candidate="$1"
  local topology leader

  echo "Current cluster topology:"
  topology="$(patroni_list)"
  printf '%s\n' "${topology}"

  # Rows of the default 'patronictl list' table start with '|', so the first
  # whitespace-separated word is the border, not the name. Split on '|' and
  # take the Member cell of the first row that mentions Leader. The listing is
  # parsed from a variable so awk's early exit cannot break a pipe.
  leader="$(awk -F'|' '
      function trim(s) { gsub(/^[ \t]+|[ \t]+$/, "", s); return s }
      BEGIN { col = 2 }              # first cell after the leading "|"
      /^[ \t]*[+]/ { next }          # border / title lines
      {
        # Header row: remember which column holds the member name.
        for (i = 1; i <= NF; i++) {
          if (trim($i) == "Member") { col = i; next }
        }
      }
      /Leader/ { print trim($col); exit }
    ' <<<"${topology}")"
  if [[ -z "${leader}" ]]; then
    echo "Could not determine current leader." >&2
    exit 1
  fi

  echo "Switching over from ${leader} to ${candidate}..."
  run_admin "docker exec ${PATRONI_CONTAINER_NAME} patronictl -c ${PATRONI_CONFIG_PATH} switchover ${CLUSTER_NAME} --leader ${leader} --candidate ${candidate} --force"
  echo "Updated topology:"
  patroni_list
}

#######################################
# Replace an etcd member: remove the old one, then register the new one.
# Globals:   ETCD_CONTAINER_NAME (read), ETCD_ENDPOINTS (read)
# Arguments: $1 - name of the member to remove
#            $2 - name of the member to add (also used as its peer host name)
#            $3 - new member's WireGuard IP; taken from the command line but
#                 not referenced in the body
# Outputs:   progress and follow-up instructions on stdout, errors on stderr
# Exits:     1 when no member ID is found for $1
#######################################
replace_etcd_member() {
  local old_name="$1" new_name="$2"
  local new_ip="$3"
  local old_member_id
  local peer_url="http://${new_name}:2380"
  local etcdctl="ETCDCTL_API=3 etcdctl --endpoints=${ETCD_ENDPOINTS}"

  old_member_id="$(get_etcd_member_id_by_name "$old_name")"
  if [[ -z "$old_member_id" ]]; then
    echo "Could not find etcd member ID for ${old_name}." >&2
    exit 1
  fi

  # Both etcdctl calls run inside the etcd container; the single quotes make
  # 'sh -lc' receive each call as one script argument.
  printf '%s\n' "Removing old etcd member ${old_name} (${old_member_id})..."
  run_admin "docker exec ${ETCD_CONTAINER_NAME} sh -lc '${etcdctl} member remove ${old_member_id}'"

  printf '%s\n' "Adding new etcd member ${new_name} with peer URL ${peer_url} ..."
  run_admin "docker exec ${ETCD_CONTAINER_NAME} sh -lc '${etcdctl} member add ${new_name} --peer-urls=${peer_url}'"

  printf '%s\n' \
    "Use the exact output of the etcd member-add command above when bringing up etcd on the new host." \
    "Important points:" \
    "- initial-cluster-state should be 'existing'" \
    "- the new node's etcd peer URL should match ${peer_url} (or change this script if you prefer an IP)" \
    "- after etcd is up, bring up Patroni/PostgreSQL on the new node"
}

# Command dispatch. The first argument selects the subcommand and "$#" counts
# it too, so a subcommand with N operands expects $# == N + 1.
# An explicit help request prints usage on stdout and succeeds; a missing or
# unknown subcommand, or a wrong operand count, prints usage on stderr and
# exits 1.
cmd="${1:-}"
case "$cmd" in
  add-replica)
    [[ $# -eq 4 ]] || { usage >&2; exit 1; }
    add_replica "$2" "$3" "$4"
    ;;
  remove-replica)
    [[ $# -eq 3 ]] || { usage >&2; exit 1; }
    remove_replica "$2" "$3"
    ;;
  switchover)
    [[ $# -eq 2 ]] || { usage >&2; exit 1; }
    switchover_primary "$2"
    ;;
  replace-etcd-member)
    [[ $# -eq 4 ]] || { usage >&2; exit 1; }
    replace_etcd_member "$2" "$3" "$4"
    ;;
  list)
    patroni_list
    ;;
  etcd-members)
    etcd_members
    ;;
  -h | --help | help)
    usage
    ;;
  *)
    usage >&2
    exit 1
    ;;
esac
