#!/usr/bin/env bash
# pearl-miner — one-command launcher for the PearlDaddy GEMM-transcript miner.
#
# Distribution mode (the published tarball): a `src/` directory with the full
# miner workspace sits next to this script; first run copies it into
# ~/.pearl-miner/src and builds it (CUDA kernel + Rust ext — takes a while,
# cached afterwards). No git access needed.
#
# Dev fallback (no bundled src/): clones the pearl_mining repo instead
# (private — set PEARL_GIT_TOKEN).
#
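# Usage:
#   ./pearl-miner --user <prl1...> [--host <ip:port>] [--worker <name>] [--gpus <all|0,1>] [--tls]
#   ./pearl-miner selftest --user <prl1...>     # register + one job, no GPU needed
#   ./pearl-miner service --user <prl1...> [...]   # install + start + run on boot
#   ./pearl-miner service status|stop|restart|update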
#
# Env overrides:
#   PEARL_POOL_HOST    pool host:port            (default: 5.78.124.226:9000)
#   PEARL_MINER_HOME   install/cache directory   (default: ~/.pearl-miner)
#   PEARL_REPO_URL     fallback source repo      (default: https://github.com/PearlDaddy/pearl_mining.git)
#   PEARL_BRANCH       fallback branch           (default: pool-client)
#   PEARL_GIT_TOKEN    GitHub token for the fallback clone
#   PEARL_NO_AUTO_UPDATE  any non-empty value disables the launcher self-update
set -euo pipefail

# Keep the untouched command line: the self-updater re-execs the launcher
# with exactly these arguments.
ORIG_ARGS=("$@")

# ── Where this launcher lives (a bundled src/ may sit right next to it) ──────
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BUNDLED_SRC="$SCRIPT_DIR/src"

# ── Install / cache locations ───────────────────────────────────────────────
HOME_DIR="${PEARL_MINER_HOME:-$HOME/.pearl-miner}"
SRC_DIR="$HOME_DIR/src"

# ── Pool endpoint ───────────────────────────────────────────────────────────
DEFAULT_HOST="5.78.124.226:9000"
HOST="${PEARL_POOL_HOST:-$DEFAULT_HOST}"

# ── Fallback git source (only used when no src/ is bundled) ─────────────────
REPO_URL="${PEARL_REPO_URL:-https://github.com/PearlDaddy/pearl_mining.git}"
BRANCH="${PEARL_BRANCH:-pool-client}"

# ── Command-line state, pre-seeded with defaults ────────────────────────────
SUBCMD="pool"
SVC_ACTION=""
WALLET=""
WORKER="${HOSTNAME:-rig}"
GPUS="all"
TLS=""

# Some GPU-template images export UV_NO_CACHE=1 or an unusual UV_CACHE_DIR,
# which silently makes every install re-download ~5 GB of wheels and recompile
# the CUDA kernel. A mining rig wants that cache, so drop the opt-out and pin
# the cache under our own home directory.
unset UV_NO_CACHE
export UV_CACHE_DIR="$HOME_DIR/uv-cache"

# Guard for flags that take a value. Call as `require_value "$@"` while the
# flag is still $1: if nothing follows it, print a clear error on stderr and
# exit 1 (otherwise `set -u` would abort with a cryptic "unbound variable").
require_value() {
    if [[ $# -lt 2 ]]; then
        echo "[pearl-miner] ERROR: $1 requires a value" >&2
        exit 1
    fi
}

# Parse the command line into HOST / WALLET / WORKER / GPUS / TLS / SUBCMD /
# SVC_ACTION. Flags and subcommands may appear in any order.
while [[ $# -gt 0 ]]; do
    case $1 in
        --host)    require_value "$@"; HOST="$2";   shift 2 ;;
        --user)    require_value "$@"; WALLET="$2"; shift 2 ;;
        --worker)  require_value "$@"; WORKER="$2"; shift 2 ;;
        --gpus)    require_value "$@"; GPUS="$2";   shift 2 ;;
        --tls)     TLS="--tls"; shift ;;
        selftest)  SUBCMD="selftest"; shift ;;
        # All three spellings select the same "pool" subcommand.
        mine|pool|mine-blocks) SUBCMD="pool"; shift ;;
        service)
            SUBCMD="service"; shift
            # An optional action word may follow; anything else (or nothing)
            # means "install" and is left in place for the outer loop to parse.
            case "${1:-}" in
                stop|status|restart|update) SVC_ACTION="$1"; shift ;;
                *)                          SVC_ACTION="install" ;;
            esac ;;
        --help|-h)
            echo "Usage: $0 --user <wallet> [--host <ip:port>] [--worker <name>] [--gpus <indices>] [--tls]"
            echo "       $0 selftest --user <wallet>"
            echo "       $0 service --user <wallet> [...]   # install + start + run on boot"
            echo "       $0 service status|stop|restart|update"
            echo "Default pool: $DEFAULT_HOST (override with --host or PEARL_POOL_HOST)"
            exit 0 ;;
        # Errors belong on stderr so they are not mistaken for normal output.
        *) echo "Unknown option: $1" >&2; exit 1 ;;
    esac
done

# Emit one tagged informational line on stdout.
log() {
    printf '%s\n' "[pearl-miner] $*"
}

# Emit one tagged error line on stderr, then terminate with status 1.
die() {
    printf '%s\n' "[pearl-miner] ERROR: $*" >&2
    exit 1
}

# A payout address is mandatory for every invocation except the service
# management actions that act on an already-installed service (i.e. any
# `service` action other than "install").
if ! [[ "$SUBCMD" == "service" && "$SVC_ACTION" != "install" ]]; then
    if [[ -z "$WALLET" ]]; then
        die "--user is required (your prl1... payout address)"
    fi
    # An unexpected prefix is only warned about, never rejected.
    if [[ "$WALLET" != prl1* ]]; then
        echo "WARNING: wallet address should start with prl1" >&2
    fi
fi

# ── Self-update (before anything heavy) ─────────────────────────────────────
# Compares the pool's published bundle commit against the bundled copy; on a
# mismatch downloads the new tarball, verifies its sha256, swaps src/ and the
# launcher in place, and re-execs once (PEARL_UPDATED reentry guard). Any
# failure → continue on the current version; updates never cost uptime.
#
# Globals read: HOST, SCRIPT_DIR, BUNDLED_SRC, ORIG_ARGS,
#               PEARL_NO_AUTO_UPDATE, PEARL_UPDATED.
# Returns:      0 on every path that does not apply an update. When an update
#               is applied the function does not return: it execs the launcher.
#
# The script runs under `set -e` + `pipefail`, so every step that may fail is
# guarded explicitly here; an unguarded failing assignment would abort the
# whole launcher instead of falling back to the current version.
#
# NOTE(review): the bundle and its checksum are both fetched over plain http
# from the pool host, so the sha256 check detects corruption, not tampering.
self_update() {
    [[ -n "${PEARL_NO_AUTO_UPDATE:-}" || -n "${PEARL_UPDATED:-}" ]] && return 0
    # Only self-update a bundled distribution that we are able to rewrite.
    [[ -f "$BUNDLED_SRC/BUNDLE_COMMIT" && -w "$SCRIPT_DIR" ]] || return 0

    local base="http://${HOST%%:*}/downloads"
    local self_path
    self_path="$SCRIPT_DIR/$(basename "${BASH_SOURCE[0]}")"

    local remote local_c
    remote="$(curl -fsS --max-time 10 "$base/pearl-miner-src.commit" 2>/dev/null | tr -d '[:space:]')" || return 0
    [[ "$remote" =~ ^[0-9a-f]{40}$ ]] || return 0
    local_c="$(tr -d '[:space:]' < "$BUNDLED_SRC/BUNDLE_COMMIT")" || return 0
    [[ "$remote" == "$local_c" ]] && return 0

    log "newer bundle published (${remote:0:12} != ${local_c:0:12}) — self-updating"
    local tmp
    tmp="$(mktemp -d)" || return 0
    if ! curl -fsSL --max-time 300 "$base/pearl-miner-linux-x64.tar.gz" -o "$tmp/bundle.tar.gz"; then
        log "WARNING: update download failed — continuing on current version"; rm -rf "$tmp"; return 0
    fi

    # A failed checksum fetch (or a missing sha256sum) must read as a mismatch,
    # not kill the launcher via errexit.
    local want got
    want="$(curl -fsS --max-time 10 "$base/pearl-miner-linux-x64.tar.gz.sha256" 2>/dev/null | awk '{print $1}')" || want=""
    got="$(sha256sum "$tmp/bundle.tar.gz" 2>/dev/null | awk '{print $1}')" || got=""
    if [[ -z "$want" || -z "$got" || "$want" != "$got" ]]; then
        log "WARNING: update checksum mismatch — continuing on current version"; rm -rf "$tmp"; return 0
    fi
    if ! tar xzf "$tmp/bundle.tar.gz" -C "$tmp" 2>/dev/null; then
        log "WARNING: update unpack failed — continuing on current version"; rm -rf "$tmp"; return 0
    fi
    # Never delete the working src/ before we know a replacement exists.
    if [[ ! -d "$tmp/src" ]]; then
        log "WARNING: update bundle has no src/ — continuing on current version"; rm -rf "$tmp"; return 0
    fi

    # Stage the new tree next to the old one so the destructive part of the
    # swap is just an rm + rename.
    local staged="$SCRIPT_DIR/.src.update.$$"
    rm -rf "$staged"
    if ! cp -a "$tmp/src" "$staged" 2>/dev/null; then
        log "WARNING: update staging failed — continuing on current version"
        rm -rf "$staged" "$tmp"; return 0
    fi
    if ! { rm -rf "$SCRIPT_DIR/src" && mv "$staged" "$SCRIPT_DIR/src"; }; then
        log "WARNING: update swap failed — continuing on current version"
        rm -rf "$staged" "$tmp"; return 0
    fi

    # Replacing the launcher itself is best-effort: a bundle without one still
    # yields an updated src/.
    cp -f "$tmp/pearl-miner" "$self_path" 2>/dev/null || true
    chmod +x "$self_path" 2>/dev/null || true
    rm -rf "$tmp"
    log "updated to ${remote:0:12} — restarting launcher"
    PEARL_UPDATED=1 exec "$self_path" "${ORIG_ARGS[@]}"
}

# ── Supervised service (install/status/stop/restart) ────────────────────────
# Stages the launcher + bundled src into a stable dir and registers it with
# whatever init the host actually has: systemd (real servers/VMs), supervisord
# (vast.ai-style containers), or a setsid restart-loop as the last resort.
# Name the service is registered under (systemd unit / supervisord program).
SVC_NAME="pearl-miner"
# Single log file shared by every supervision mechanism.
SVC_LOG="/var/log/pearl-miner.log"
# Stable directory the launcher (and bundled src/) is staged into.
SVC_DIST="$HOME_DIR/dist"
# Holds the pid of the restart loop when no init system is available.
SVC_PIDFILE="$HOME_DIR/service.pid"

# Print the supervision mechanism usable on this host (one word on stdout):
#   systemd     /run/systemd/system exists and systemctl is on PATH
#   supervisord supervisorctl is on PATH and /etc/supervisor/conf.d exists
#   loop        neither of the above
svc_mechanism() {
    local backend="loop"
    if [[ -d /run/systemd/system ]] && command -v systemctl >/dev/null 2>&1; then
        backend="systemd"
    elif command -v supervisorctl >/dev/null 2>&1 && [[ -d /etc/supervisor/conf.d ]]; then
        backend="supervisord"
    fi
    printf '%s\n' "$backend"
}

# Install (or re-install) the miner as a supervised service and (re)start it.
#
# Steps:
#   1. Stage this launcher, plus the bundled src/ when present, into $SVC_DIST
#      (skipped when already running from $SVC_DIST).
#   2. Render the miner command line from WALLET / WORKER / GPUS / HOST / TLS.
#   3. Register it with the mechanism reported by svc_mechanism and make sure
#      the running instance picks up the new configuration.
#
# Globals read: SVC_DIST, SVC_NAME, SVC_LOG, SVC_PIDFILE, SCRIPT_DIR,
#               BUNDLED_SRC, HOME_DIR, HOME, WALLET, WORKER, GPUS, HOST, TLS.
#
# NOTE(review): the option values are interpolated unquoted into the unit /
# conf / loop script, so a value containing whitespace would be split there.
svc_install() {
    mkdir -p "$SVC_DIST"
    if [[ "$SCRIPT_DIR" != "$SVC_DIST" ]]; then
        cp -f "$SCRIPT_DIR/$(basename "${BASH_SOURCE[0]}")" "$SVC_DIST/pearl-miner" 2>/dev/null \
            || cp -f "${BASH_SOURCE[0]}" "$SVC_DIST/pearl-miner"
        chmod +x "$SVC_DIST/pearl-miner"
        if [[ -d "$BUNDLED_SRC" ]]; then
            rm -rf "$SVC_DIST/src"
            cp -a "$BUNDLED_SRC" "$SVC_DIST/src"
        fi
    fi
    local args="--user $WALLET --worker $WORKER --gpus $GPUS --host $HOST ${TLS:+--tls}"
    # Init daemons don't read shell profiles — bake the CUDA toolchain location
    # (as resolved from THIS shell, falling back to /usr/local/cuda/bin) into
    # the service PATH, or the supervised miner dies on its own nvcc preflight.
    local nvcc_dir="/usr/local/cuda/bin"
    command -v nvcc >/dev/null 2>&1 && nvcc_dir="$(dirname "$(command -v nvcc)")"
    local svc_path="$HOME/.local/bin:$HOME/.cargo/bin:$nvcc_dir:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
    local mech; mech="$(svc_mechanism)"
    log "installing service via $mech (log: $SVC_LOG)"

    case "$mech" in
        systemd)
            cat > "/etc/systemd/system/$SVC_NAME.service" <<UNIT
[Unit]
Description=Pearl Miner (GEMM-transcript pool miner)
After=network-online.target
Wants=network-online.target

[Service]
Type=simple
Environment=HOME=$HOME
Environment=PEARL_MINER_HOME=$HOME_DIR
Environment=PATH=$svc_path
ExecStart=$SVC_DIST/pearl-miner $args
Restart=always
RestartSec=15
StandardOutput=append:$SVC_LOG
StandardError=append:$SVC_LOG

[Install]
WantedBy=multi-user.target
UNIT
            systemctl daemon-reload
            systemctl enable "$SVC_NAME"
            # `enable --now` only starts a stopped unit; on a re-install the
            # already-running miner would keep its old wallet/host/args.
            # `restart` starts it when stopped and replaces it when running,
            # matching what the supervisord and loop paths do.
            systemctl restart "$SVC_NAME"
            ;;
        supervisord)
            cat > "/etc/supervisor/conf.d/$SVC_NAME.conf" <<CONF
[program:$SVC_NAME]
command=$SVC_DIST/pearl-miner $args
directory=$SVC_DIST
autostart=true
autorestart=true
startsecs=10
stopwaitsecs=30
redirect_stderr=true
stdout_logfile=$SVC_LOG
stdout_logfile_maxbytes=50MB
environment=HOME="$HOME",PEARL_MINER_HOME="$HOME_DIR",PATH="$svc_path"
CONF
            supervisorctl reread >/dev/null
            supervisorctl update "$SVC_NAME" >/dev/null 2>&1 || supervisorctl update >/dev/null
            supervisorctl restart "$SVC_NAME" >/dev/null 2>&1 || true
            ;;
        loop)
            # $SVC_DIST, $args and $SVC_LOG are expanded now, at generation
            # time; only \$? is left for the generated script to evaluate.
            cat > "$SVC_DIST/run-loop.sh" <<LOOP
#!/usr/bin/env bash
# Auto-restart wrapper (no init system available). Survives disconnects;
# cannot survive a container reboot — re-run 'pearl-miner service' after one.
while true; do
    "$SVC_DIST/pearl-miner" $args
    echo "[pearl-miner-service] miner exited (\$?) — restarting in 15s" >> "$SVC_LOG"
    sleep 15
done
LOOP
            chmod +x "$SVC_DIST/run-loop.sh"
            svc_stop_loop quiet
            # Detach into its own session so the loop outlives this terminal;
            # the recorded pid is what svc_stop_loop later signals.
            setsid nohup "$SVC_DIST/run-loop.sh" >> "$SVC_LOG" 2>&1 < /dev/null &
            echo $! > "$SVC_PIDFILE"
            ;;
    esac
    log "service installed and started — check: $0 service status"
}

# Send SIGTERM to every process whose full command line matches pattern $1
# (pgrep -f regex), except this shell, its current subshell and its parent.
# Best-effort: always returns 0, and is a no-op when pgrep is unavailable.
_svc_kill_matching() {
    local pattern="$1" victims victim
    command -v pgrep >/dev/null 2>&1 || return 0
    victims="$(pgrep -f "$pattern" 2>/dev/null || true)"
    for victim in $victims; do
        [[ "$victim" =~ ^[0-9]+$ ]] || continue
        # When the launcher is invoked by absolute path from $SVC_DIST its own
        # command line matches the miner pattern; killing it (or the sudo/shell
        # that spawned it) would abort the stop/restart half-way through.
        if [[ "$victim" == "$$" || "$victim" == "$BASHPID" || "$victim" == "$PPID" ]]; then
            continue
        fi
        kill "$victim" 2>/dev/null || true
    done
    return 0
}

# Stop the setsid restart loop and any miner it started.
# Arguments: $1 - "quiet" to suppress the status line (optional).
# Globals read: SVC_PIDFILE, SVC_DIST.
# Always best-effort: a missing/stale pidfile or already-dead process is fine.
svc_stop_loop() {
    local mode="${1:-}" pid=""
    if [[ -f "$SVC_PIDFILE" ]]; then
        pid="$(cat "$SVC_PIDFILE" 2>/dev/null || true)"
        # Only signal a sane pid: an empty/garbage value is skipped, and 0 or 1
        # would turn `kill -- -PID` into a signal to far more than the loop.
        if [[ "$pid" =~ ^[0-9]+$ ]] && (( 10#$pid > 1 )); then
            # Prefer the whole process group (loop + current miner); fall back
            # to the single pid when it is not a group leader.
            kill -- -"$pid" 2>/dev/null || kill "$pid" 2>/dev/null || true
        fi
        rm -f "$SVC_PIDFILE"
        [[ "$mode" == quiet ]] || log "stopped"
    else
        [[ "$mode" == quiet ]] || log "no pidfile — best-effort stop done"
    fi
    # Sweep by command line as well: covers a missing or stale pidfile, and a
    # miner that outlived its loop. Loop first, so it cannot respawn the miner.
    _svc_kill_matching "$SVC_DIST/run-loop.sh"
    _svc_kill_matching "$SVC_DIST/pearl-miner "
}

# Show the last lines of the service log; a missing log is not an error.
_svc_tail_log() {
    tail -5 "$SVC_LOG" 2>/dev/null || true
}

# (Re)start the setsid restart loop and record its pid in $SVC_PIDFILE.
_svc_start_loop() {
    [[ -x "$SVC_DIST/run-loop.sh" ]] \
        || die "service is not installed — run: $0 service --user <wallet>"
    svc_stop_loop quiet
    setsid nohup "$SVC_DIST/run-loop.sh" >> "$SVC_LOG" 2>&1 < /dev/null &
    echo $! > "$SVC_PIDFILE"
}

# Download the published bundle, verify it, and replace $SVC_DIST/src and
# $SVC_DIST/pearl-miner with its contents. Dies (exit 1) on any failure.
_svc_force_update() {
    local base="http://${HOST%%:*}/downloads"
    local tmp want got
    tmp="$(mktemp -d)" || die "update failed"
    if ! curl -fsSL "$base/pearl-miner-linux-x64.tar.gz" -o "$tmp/b.tgz"; then
        rm -rf "$tmp"; die "update failed"
    fi
    # Same integrity check the automatic self-update performs.
    want="$(curl -fsS --max-time 10 "$base/pearl-miner-linux-x64.tar.gz.sha256" 2>/dev/null | awk '{print $1}')" || want=""
    got="$(sha256sum "$tmp/b.tgz" 2>/dev/null | awk '{print $1}')" || got=""
    if [[ -z "$want" || "$want" != "$got" ]]; then
        rm -rf "$tmp"; die "update failed: checksum mismatch"
    fi
    # Validate the unpacked bundle BEFORE touching the installed copy, so a
    # malformed tarball cannot leave the service without its src/.
    if ! tar xzf "$tmp/b.tgz" -C "$tmp" || [[ ! -d "$tmp/src" || ! -f "$tmp/pearl-miner" ]]; then
        rm -rf "$tmp"; die "update failed"
    fi
    if ! { rm -rf "$SVC_DIST/src" \
            && cp -a "$tmp/src" "$SVC_DIST/src" \
            && cp -f "$tmp/pearl-miner" "$SVC_DIST/pearl-miner" \
            && chmod +x "$SVC_DIST/pearl-miner"; }; then
        rm -rf "$tmp"; die "update failed"
    fi
    rm -rf "$tmp"
}

# Run the management action in $SVC_ACTION (status | stop | restart | update)
# against whichever mechanism svc_mechanism reports.
# Globals read: SVC_ACTION, SVC_NAME, SVC_DIST, SVC_LOG, SVC_PIDFILE, HOST.
svc_action() {
    local mech; mech="$(svc_mechanism)"
    case "$mech:$SVC_ACTION" in
        systemd:status)
            # `systemctl status` exits non-zero for a stopped unit; under
            # errexit + pipefail that would abort before the log tail.
            systemctl status "$SVC_NAME" --no-pager | head -12 || true
            _svc_tail_log ;;
        systemd:stop)        systemctl disable --now "$SVC_NAME" ;;
        systemd:restart)     systemctl restart "$SVC_NAME" ;;
        supervisord:status)
            supervisorctl status "$SVC_NAME" || true
            _svc_tail_log ;;
        supervisord:stop)    supervisorctl stop "$SVC_NAME" ;;
        supervisord:restart) supervisorctl restart "$SVC_NAME" ;;
        loop:status)
            local pid=""
            [[ -f "$SVC_PIDFILE" ]] && pid="$(cat "$SVC_PIDFILE" 2>/dev/null || true)"
            if [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null; then
                log "running (loop pid $pid)"
            else
                log "not running"
            fi
            _svc_tail_log ;;
        loop:stop)           svc_stop_loop ;;
        loop:restart)        _svc_start_loop; log "restarted" ;;
        *:update)
            log "forcing update of $SVC_DIST from the pool"
            _svc_force_update
            case "$mech" in
                systemd)     systemctl restart "$SVC_NAME" ;;
                supervisord) supervisorctl restart "$SVC_NAME" ;;
                loop)        _svc_start_loop ;;
            esac
            log "updated and restarted" ;;
        *)  die "unsupported service action: $SVC_ACTION" ;;
    esac
}

# Service management short-circuits the normal mining flow: run the requested
# action and stop here, before the preflight/build/run steps below.
if [[ "$SUBCMD" == "service" ]]; then
    case "$SVC_ACTION" in
        install) svc_install ;;
        *)       svc_action ;;
    esac
    exit 0
fi

# ── Preflight ────────────────────────────────────────────────────────────────
# Hard requirements first: curl (downloads) and nvcc (GPU kernel build).
if ! command -v curl >/dev/null; then
    die "curl is required (apt install curl)"
fi
if ! command -v nvcc >/dev/null; then
    die "nvcc (CUDA toolkit 12.x) is required to build the GPU kernel.
  On Ubuntu CUDA images it is preinstalled; otherwise see https://developer.nvidia.com/cuda-downloads"
fi

# Soft check: only the first GPU reported by nvidia-smi is inspected, and a
# non-H100 card merely produces a warning.
if command -v nvidia-smi >/dev/null; then
    GPU_NAME="$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -1 || true)"
    if [[ -n "$GPU_NAME" && "$GPU_NAME" != *H100* ]]; then
        echo "WARNING: detected '$GPU_NAME' — the scan kernel currently targets H100 (sm_90)" >&2
    fi
fi

# Pick up a newer published bundle (if any) before the expensive steps.
self_update

# ── Source install: bundled src/ first, git clone fallback ──────────────────
# Three cases:
#   1. A bundled src/ sits next to the launcher → mirror it into $SRC_DIR
#      whenever its BUNDLE_COMMIT differs from the installed one.
#   2. Nothing bundled and nothing installed → shallow-clone the repo.
#   3. Nothing bundled but a git checkout exists → refresh it (best-effort).
# (Nothing bundled + a previously installed bundle copy → use it as-is.)

# The repo is private: whenever a token is provided, authenticate BOTH the
# initial clone and later fetches with it. The token is only ever passed on
# the command line and never stored in the checkout's remote configuration.
CLONE_URL="$REPO_URL"
if [[ -n "${PEARL_GIT_TOKEN:-}" ]]; then
    CLONE_URL="${REPO_URL/https:\/\//https://x-access-token:${PEARL_GIT_TOKEN}@}"
fi

if [[ -f "$BUNDLED_SRC/BUNDLE_COMMIT" ]]; then
    WANT="$(cat "$BUNDLED_SRC/BUNDLE_COMMIT")"
    HAVE="$(cat "$SRC_DIR/BUNDLE_COMMIT" 2>/dev/null || true)"
    if [[ "$WANT" != "$HAVE" ]]; then
        log "installing bundled miner source (${WANT:0:12}) → $SRC_DIR"
        # Copy into a staging directory and rename at the end: an interrupted
        # copy then never leaves a half-populated $SRC_DIR whose BUNDLE_COMMIT
        # already matches (which would be mistaken for a complete install).
        mkdir -p "$HOME_DIR"
        rm -rf "$SRC_DIR.partial"
        mkdir -p "$SRC_DIR.partial"
        cp -a "$BUNDLED_SRC/." "$SRC_DIR.partial/"
        rm -rf "$SRC_DIR"
        mv "$SRC_DIR.partial" "$SRC_DIR"
    fi
elif [[ ! -d "$SRC_DIR/.git" && ! -f "$SRC_DIR/BUNDLE_COMMIT" ]]; then
    command -v git >/dev/null || die "no bundled src/ next to the script and git is missing"
    log "no bundled src/ — fetching miner source ($REPO_URL @ $BRANCH)"
    mkdir -p "$HOME_DIR"
    git clone --depth 1 --branch "$BRANCH" "$CLONE_URL" "$SRC_DIR"
    git -C "$SRC_DIR" remote set-url origin "$REPO_URL"   # never persist the token
    git -C "$SRC_DIR" submodule update --init miner/pearl-gemm/third_party/cutlass
elif [[ -d "$SRC_DIR/.git" ]]; then
    log "updating miner source (git dev mode)"
    # `origin` was deliberately stripped of the token after the clone, so a
    # plain `fetch origin` cannot authenticate against the private repo. With
    # a token, fetch from the authenticated URL directly (FETCH_HEAD is written
    # either way); without one, keep using whatever `origin` points at.
    FETCH_FROM="origin"
    if [[ -n "${PEARL_GIT_TOKEN:-}" ]]; then
        FETCH_FROM="$CLONE_URL"
    fi
    git -C "$SRC_DIR" fetch --depth 1 "$FETCH_FROM" "$BRANCH" || log "WARNING: fetch failed, using cached source"
    git -C "$SRC_DIR" reset --hard FETCH_HEAD 2>/dev/null || true
    git -C "$SRC_DIR" submodule update --init miner/pearl-gemm/third_party/cutlass || true
fi

# ── Toolchain: uv (Python) + rustup (py-pearl-mining ext) ────────────────────
# Both installers drop their binaries under $HOME, so those directories go on
# PATH first; each tool is then installed only when it is still missing.
export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"

if command -v uv >/dev/null; then
    :   # uv already available
else
    log "installing uv (Python package manager)"
    curl -LsSf https://astral.sh/uv/install.sh | sh
fi

if command -v cargo >/dev/null; then
    :   # Rust toolchain already available
else
    log "installing Rust toolchain (builds the pearl_mining extension)"
    curl -LsSf https://sh.rustup.rs | sh -s -- -y --profile minimal
    # Load the environment file written by rustup into this shell.
    source "$HOME/.cargo/env"
fi

# ── Build (first run compiles the CUDA kernel — takes a while, then cached) ──
# Skip the build only when the miner entry point is already executable AND
# the lockfile is not newer than the virtualenv.
VENV_BIN="$SRC_DIR/.venv/bin/pure-gemm-scanner"
if ! [[ -x "$VENV_BIN" && ! "$SRC_DIR/uv.lock" -nt "$SRC_DIR/.venv" ]]; then
    log "building miner (uv sync --package pure-gemm-scanner)"
    (
        cd "$SRC_DIR" && uv sync --package pure-gemm-scanner
    )
fi

# ── GPU selection ────────────────────────────────────────────────────────────
# "all" leaves device visibility untouched; any other value is exported to
# CUDA verbatim.
case "$GPUS" in
    all) ;;
    *)   export CUDA_VISIBLE_DEVICES="$GPUS" ;;
esac

# ── Run ──────────────────────────────────────────────────────────────────────
# Replace this shell with the miner binary; --tls is appended only when set.
log "→ $SUBCMD host=$HOST wallet=$WALLET worker=$WORKER gpus=$GPUS"
miner_argv=("$SUBCMD" --host "$HOST" --wallet "$WALLET" --worker "$WORKER")
if [[ -n "$TLS" ]]; then
    miner_argv+=("$TLS")
fi
exec "$VENV_BIN" "${miner_argv[@]}"
