#!/usr/bin/env bash
# pearl_mine — clean (re)start of the pure-GEMM Pearl miner + Grafana Alloy.
#
#   ./pearl_mine            clean restart, then follow the log (DEFAULT)
#   ./pearl_mine start      same as above (alias: restart)
#   ./pearl_mine stop       TOTALLY stop scanner + alloy
#   ./pearl_mine status     one-shot health check (non-blocking)
#   ./pearl_mine logs       follow the live log (Ctrl-C stops watching only)
#
# Services start in their own session (setsid), so following the log and
# pressing Ctrl-C — or closing the tmux pane — does NOT stop mining.
# Only "./pearl_mine stop" stops the miner.
# Unset variables are errors and a pipeline fails if any stage fails.
# errexit (-e) is not enabled.
set -uo pipefail

# Checkout location and the interpreter inside its virtualenv.
REPO=/workspace/pearl
PYTHON="${REPO}/.venv/bin/python"

# ── Mining config (reward-optimized 2026-06-10; see project_puregemm_reward_law) ──
# Reward ∝ m·n·k (useful INT8 MAC/s), NOT tiles/s. This (16384², k=16384, r=128)
# measured 660 eff-TOPS = +60% vs the old 8192²/k=2048/r=64 default, at the same 400W
# cap (~18GB VRAM). tiles/s LOOKS ~5× lower than the old default — expected & correct.
# A non-empty value already present in the environment wins over the default.
: "${SCANNER_MATRIX_M:=16384}"
: "${SCANNER_MATRIX_N:=16384}"
: "${SCANNER_MATRIX_K:=16384}"
: "${SCANNER_NOISE_RANK:=128}"
export SCANNER_MATRIX_M SCANNER_MATRIX_N SCANNER_MATRIX_K SCANNER_NOISE_RANK

# Log files: miner output and Alloy output respectively.
SCANNER_LOG="/tmp/scanner.log"
ALLOY_LOG="/tmp/alloy.log"

# Grafana Alloy: binary, config file and storage directory.
ALLOY_BIN="/usr/bin/alloy"
ALLOY_CFG="/etc/alloy/config.alloy"
ALLOY_DATA="/var/lib/alloy/data"

# File that is searched for GCLOUD_RW_API_KEY=... when Alloy is started.
KEY_FILE="/etc/systemd/system/alloy.service.d/env.conf"

# TCP port whose listening state is probed with ss.
METRICS_PORT=9101

# Pre-flight gate run before the miner is touched.
# Globals: PYTHON (interpreter to probe), REPO (shown in the rebuild hint).
# Returns 0 when the interpreter can import the native deps; otherwise prints
# an explanation and exits the whole script with status 1.
check_venv() {
  # A venv that cannot import its native modules (e.g. .venv was rsynced from
  # a Mac → wrong-platform .so files) must not replace a working miner.
  "$PYTHON" -c "import pearl_mining, torch, pure_gemm_scanner" >/dev/null 2>&1 && return

  printf '%s\n' \
    "[pearl_mine] ABORT: venv is broken — $PYTHON cannot import pearl_mining/torch/pure_gemm_scanner." \
    "[pearl_mine] Leaving the running miner untouched. Rebuild first:" \
    "[pearl_mine]   cd $REPO && uv sync --all-packages --reinstall" \
    "[pearl_mine] (and never rsync .venv between Mac and this host)."
  exit 1
}

# Terminate every process whose full command line matches a pattern.
# SIGTERM is sent first so the process can shut down cleanly; SIGKILL is only
# used if something still matches after a grace period of five 1-second polls.
# Arguments: $1 - label for the report line (pre-padded for column alignment)
#            $2 - pattern handed to pgrep/pkill -f
# Outputs:   one report line on stdout
# Returns:   0 if nothing matches afterwards, 1 if a match survived SIGKILL
_stop_matching() {
  local label=$1 pattern=$2 waited

  if ! pgrep -f "$pattern" >/dev/null 2>&1; then
    echo "  ${label} : (none running)"
    return 0
  fi

  pkill -TERM -f "$pattern" 2>/dev/null
  for ((waited = 0; waited < 5; waited++)); do
    pgrep -f "$pattern" >/dev/null 2>&1 || break
    sleep 1
  done

  if pgrep -f "$pattern" >/dev/null 2>&1; then
    pkill -KILL -f "$pattern" 2>/dev/null
    sleep 1
  fi

  # Still matching after SIGKILL: we could not signal it (e.g. it belongs to
  # another user) or it is stuck. Say so instead of claiming success.
  if pgrep -f "$pattern" >/dev/null 2>&1; then
    echo "  ${label} : WARN: still running after SIGKILL (permissions? wedged?)"
    return 1
  fi
  echo "  ${label} : killed"
}

# Stop the scanner and alloy, then warn if the metrics port is still bound.
# Globals: ALLOY_BIN, METRICS_PORT (both read).
# Outputs: progress lines on stdout.
stop() {
  local listeners

  echo "[pearl_mine] stopping miner + alloy ..."
  _stop_matching "scanner" pure_gemm_scanner
  _stop_matching "alloy  " "${ALLOY_BIN} run"

  # Give the kernel a moment to release the listening socket.
  sleep 2

  # Capture ss output before matching. Piping into `grep -q` under pipefail
  # can fail on a successful match, because grep exits early and ss then dies
  # of SIGPIPE — which would silently suppress this warning.
  listeners=$(ss -tln 2>/dev/null)
  if [[ $listeners == *":${METRICS_PORT} "* ]]; then
    echo "  WARN: port ${METRICS_PORT} still bound — something may be wedged"
  fi
  echo "[pearl_mine] stopped."
}

# Launch Grafana Alloy detached in its own session, if the binary is present.
# Globals: ALLOY_BIN, ALLOY_CFG, ALLOY_DATA, ALLOY_LOG, KEY_FILE (all read).
# Outputs: progress/warning lines on stdout; alloy's own output goes to ALLOY_LOG.
start_alloy() {
  local api_key
  local -a alloy_cmd

  if [[ -x "$ALLOY_BIN" ]]; then
    # Take whatever follows GCLOUD_RW_API_KEY= (up to a double quote) from the
    # key file; a missing file or no match simply leaves the key empty.
    api_key=$(grep -oP '(?<=GCLOUD_RW_API_KEY=)[^"]+' "$KEY_FILE" 2>/dev/null || true)
    if [[ -z "$api_key" ]]; then
      echo "[pearl_mine] WARN: no GCLOUD_RW_API_KEY in $KEY_FILE — alloy won't push to Grafana"
    fi

    echo "[pearl_mine] starting alloy   -> $ALLOY_LOG"
    alloy_cmd=("$ALLOY_BIN" run --storage.path="$ALLOY_DATA" "$ALLOY_CFG")
    # The key is handed over through the environment; setsid starts alloy in
    # a session of its own, with stdin detached and output sent to the log.
    GCLOUD_RW_API_KEY="$api_key" setsid "${alloy_cmd[@]}" </dev/null >"$ALLOY_LOG" 2>&1 &
  else
    echo "[pearl_mine] alloy not installed — skipping metrics (miner still runs)"
  fi
}

# Launch the miner detached in its own session, with output sent to SCANNER_LOG.
# Globals: PYTHON, REPO, SCANNER_LOG (all read).
# Outputs: one progress line on stdout (or an error line before exiting 1 when
#          the venv interpreter is not executable).
start_scanner() {
  [[ -x "$PYTHON" ]] || { echo "[pearl_mine] ERROR: venv python not at $PYTHON"; exit 1; }
  echo "[pearl_mine] starting miner   -> $SCANNER_LOG"

  # Paths are passed as positional parameters rather than spliced into the
  # command string, so quotes or spaces in them cannot break the command.
  # Inside the launcher:
  #   - the log is opened first, so a failing cd is recorded there too;
  #   - a failing cd aborts, so the miner never starts from the wrong directory;
  #   - exec replaces the launcher shell with the miner.
  setsid bash -c '
    exec >"$3" 2>&1
    cd -- "$1" || exit 1
    PYTHONUNBUFFERED=1 exec "$2" -m pure_gemm_scanner
  ' pearl_mine "$REPO" "$PYTHON" "$SCANNER_LOG" </dev/null >/dev/null 2>&1 &
}

# One-shot health report: process presence, metrics port, GPU figures (when
# nvidia-smi is available) and the last line of the miner log.
# Globals: ALLOY_BIN, METRICS_PORT, SCANNER_LOG (all read).
# Outputs: the report on stdout.
status() {
  local listeners

  echo "──────────────── pearl_mine status ────────────────"

  if pgrep -f 'python -m pure_gemm_scanner' >/dev/null; then
    echo "scanner : RUNNING"
  else
    echo "scanner : STOPPED"
  fi

  if pgrep -f "${ALLOY_BIN} run" >/dev/null; then
    echo "alloy   : RUNNING"
  else
    echo "alloy   : STOPPED"
  fi

  # Capture ss output before matching. Piping into `grep -q` under pipefail
  # can fail on a successful match (grep exits early, ss dies of SIGPIPE),
  # which would report DOWN for a port that is actually listening.
  listeners=$(ss -tln 2>/dev/null)
  if [[ $listeners == *":${METRICS_PORT} "* ]]; then
    echo "metrics : :${METRICS_PORT} up"
  else
    echo "metrics : :${METRICS_PORT} DOWN"
  fi

  if command -v nvidia-smi >/dev/null 2>&1; then
    echo "gpu     : $(nvidia-smi --query-gpu=utilization.gpu,power.draw,memory.used --format=csv,noheader 2>/dev/null)"
  fi

  # A missing log just produces no "last" line.
  tail -n 1 "$SCANNER_LOG" 2>/dev/null | sed 's/^/last    : /'
  echo "───────────────────────────────────────────────────"
}

# Entry point: dispatch on the first argument (defaults to "restart").
#   start|restart  verify the venv, stop anything running, start alloy and the
#                  miner, print a status snapshot, then follow the miner log
#   stop           stop scanner + alloy
#   status         print the one-shot status report
#   logs           follow the miner log
# Anything else prints the usage line and exits 1.
main() {
  local action=${1:-restart}

  case "$action" in
    stop)
      stop
      ;;
    status)
      status
      ;;
    logs)
      exec tail -n 30 -f "$SCANNER_LOG"
      ;;
    start | restart)
      check_venv
      stop
      start_alloy
      start_scanner
      echo "[pearl_mine] warming up ..."
      sleep 8
      status
      echo "[pearl_mine] following log — Ctrl-C stops WATCHING only (miner keeps running)."
      echo "[pearl_mine] to actually stop mining:  ./pearl_mine stop"
      echo "────────────────────────────────────────────────────────────────────"
      # exec: this script becomes the tail process, so Ctrl-C ends only tail.
      exec tail -n 5 -f "$SCANNER_LOG"
      ;;
    *)
      echo "usage: $0 [start|stop|status|logs|restart]"
      exit 1
      ;;
  esac
}

main "$@"
