#!/bin/bash # ============================================ # XingRin Agent # 用途:心跳上报 + 负载监控 + 版本检查 # 适用:远程 VPS 或 Docker 容器内 # ============================================ # 检查是否禁用 Agent if [ "${AGENT_DISABLED:-false}" = "true" ]; then echo "[AGENT] 已禁用,跳过启动" exit 0 fi # 配置 MARKER_DIR="/opt/xingrin" SRC_DIR="${MARKER_DIR}/src" ENV_FILE="${SRC_DIR}/backend/.env" INTERVAL=${AGENT_INTERVAL:-3} # Agent 版本(从环境变量获取,由 Docker 镜像构建时注入) AGENT_VERSION="${IMAGE_TAG:-unknown}" # 颜色定义 GREEN='\033[0;32m' RED='\033[0;31m' YELLOW='\033[0;33m' NC='\033[0m' log() { echo -e "[$(date +'%Y-%m-%d %H:%M:%S')] [AGENT] $1" } # 检测运行模式:容器内 or 远程 VPS # 如果 /.dockerenv 存在,说明在容器内 if [ -f "/.dockerenv" ]; then RUN_MODE="container" log "运行模式: Docker 容器内" else RUN_MODE="remote" log "运行模式: 远程 VPS" # 远程模式:检测 Docker 命令 if docker info >/dev/null 2>&1; then DOCKER_CMD="docker" else DOCKER_CMD="sudo docker" fi fi # 加载环境变量(远程模式从文件,容器模式从环境变量) if [ "$RUN_MODE" = "remote" ] && [ -f "$ENV_FILE" ]; then set -a source "$ENV_FILE" set +a fi # 获取配置 # SERVER_URL: 后端 API 地址(容器内用 http://server:8888,远程用 https://{PUBLIC_HOST}) API_URL="${HEARTBEAT_API_URL:-${SERVER_URL:-}}" WORKER_NAME="${WORKER_NAME:-}" IS_LOCAL="${IS_LOCAL:-false}" # 容器模式默认标记为本地节点 if [ "$RUN_MODE" = "container" ]; then IS_LOCAL="true" fi log "${GREEN}Agent 启动...${NC}" log "心跳间隔: ${INTERVAL}s" if [ -z "$API_URL" ]; then log "${RED}错误: 未配置 API 地址 (HEARTBEAT_API_URL 或 SERVER_URL)${NC}" exit 1 fi log "API 地址: ${API_URL}" # ============================================ # 自注册功能(如果 WORKER_ID 未设置) # ============================================ register_worker() { if [ -z "$WORKER_NAME" ]; then WORKER_NAME="Worker-$(hostname)" fi log "注册 Worker: ${WORKER_NAME}..." REGISTER_DATA=$(cat </dev/null) if [ $? -eq 0 ]; then # 解析返回的 workerId(API 使用 camelCase) WORKER_ID=$(echo "$RESPONSE" | grep -oE '"workerId":\s*[0-9]+' | grep -oE '[0-9]+') if [ -n "$WORKER_ID" ]; then log "${GREEN}注册成功: ${WORKER_NAME} (ID: ${WORKER_ID})${NC}" return 0 fi fi log "${RED}注册失败: ${RESPONSE}${NC}" return 1 } # 如果没有 WORKER_ID,执行自注册 if [ -z "$WORKER_ID" ]; then # 等待 Server 就绪 log "等待 Server 就绪..." for i in $(seq 1 30); do if curl -k -s "${API_URL}/api/" > /dev/null 2>&1; then log "${GREEN}Server 已就绪${NC}" break fi log "Server 未就绪,等待... ($i/30)" sleep 5 done # 注册 while ! register_worker; do log "${YELLOW}注册失败,5 秒后重试...${NC}" sleep 5 done fi log "Worker ID: ${WORKER_ID}" # ============================================ # 心跳循环 # Agent 独立运行,始终发送心跳 # 主服务器根据心跳数据选择负载最低的节点分发任务 # ============================================ while true; do # 收集系统负载(CPU + 内存) # 容器内使用挂载的 /host/proc 获取宿主机数据 if [ -d "/host/proc" ]; then PROC_DIR="/host/proc" else PROC_DIR="/proc" fi # CPU 使用率(百分比数值) # /proc/stat 是累计值,需要两次采样计算差值 CPU_STAT1=$(grep 'cpu ' ${PROC_DIR}/stat | awk '{print $2,$3,$4,$5,$6,$7,$8}') sleep 0.5 CPU_STAT2=$(grep 'cpu ' ${PROC_DIR}/stat | awk '{print $2,$3,$4,$5,$6,$7,$8}') CPU_PERCENT=$(echo "$CPU_STAT1 $CPU_STAT2" | awk '{ user1=$1; nice1=$2; sys1=$3; idle1=$4; iowait1=$5; irq1=$6; softirq1=$7; user2=$8; nice2=$9; sys2=$10; idle2=$11; iowait2=$12; irq2=$13; softirq2=$14; total1=user1+nice1+sys1+idle1+iowait1+irq1+softirq1; total2=user2+nice2+sys2+idle2+iowait2+irq2+softirq2; idle_diff=idle2-idle1; total_diff=total2-total1; if(total_diff>0) printf "%.1f", (1-idle_diff/total_diff)*100; else printf "0.0"; }') # 内存使用率(百分比数值) if [ -d "/host/proc" ]; then # 从 /host/proc/meminfo 读取 MEM_TOTAL=$(grep 'MemTotal' ${PROC_DIR}/meminfo | awk '{print $2}') MEM_AVAILABLE=$(grep 'MemAvailable' ${PROC_DIR}/meminfo | awk '{print $2}') MEM_PERCENT=$(awk "BEGIN {printf \"%.1f\", 100 - ($MEM_AVAILABLE / $MEM_TOTAL * 100)}") else # 使用 free 命令 MEM_PERCENT=$(free | grep Mem | awk '{printf "%.1f", $3/$2 * 100}') fi # 构建 JSON 数据(使用数值而非字符串,便于比较和排序) # 包含版本号,供 Server 端检查版本一致性 JSON_DATA=$(cat </dev/null || echo "000") RESPONSE_BODY=$(cat "$RESPONSE_FILE" 2>/dev/null) rm -f "$RESPONSE_FILE" if [ "$HTTP_CODE" != "200" ] && [ "$HTTP_CODE" != "201" ]; then log "${YELLOW}心跳发送失败 (HTTP $HTTP_CODE)${NC}" else # 检查是否需要更新 NEED_UPDATE=$(echo "$RESPONSE_BODY" | grep -oE '"need_update":\s*(true|false)' | grep -oE '(true|false)') if [ "$NEED_UPDATE" = "true" ]; then SERVER_VERSION=$(echo "$RESPONSE_BODY" | grep -oE '"server_version":\s*"[^"]+"' | sed 's/.*"\([^"]*\)"$/\1/') log "${YELLOW}检测到版本不匹配: Agent=$AGENT_VERSION, Server=$SERVER_VERSION${NC}" log "${GREEN}正在自动更新...${NC}" # 执行自动更新 if [ "$RUN_MODE" = "container" ]; then # 容器模式:通知外部重启(退出后由 docker-compose restart policy 重启) log "容器模式:退出以触发重启更新" exit 0 else # 远程模式:拉取新镜像并重启 agent 容器 log "远程模式:更新 agent 镜像..." DOCKER_USER="${DOCKER_USER:-yyhuni}" NEW_IMAGE="${DOCKER_USER}/xingrin-agent:${SERVER_VERSION}" # 拉取新镜像 if $DOCKER_CMD pull "$NEW_IMAGE" 2>/dev/null; then log "${GREEN}镜像拉取成功: $NEW_IMAGE${NC}" # 停止当前容器并用新镜像重启 CONTAINER_NAME="xingrin-agent" $DOCKER_CMD stop "$CONTAINER_NAME" 2>/dev/null || true $DOCKER_CMD rm "$CONTAINER_NAME" 2>/dev/null || true # 重新启动(使用相同的环境变量) $DOCKER_CMD run -d \ --name "$CONTAINER_NAME" \ --restart unless-stopped \ -e HEARTBEAT_API_URL="$API_URL" \ -e WORKER_ID="$WORKER_ID" \ -e IMAGE_TAG="$SERVER_VERSION" \ -v /proc:/host/proc:ro \ "$NEW_IMAGE" log "${GREEN}Agent 已更新到 $SERVER_VERSION${NC}" exit 0 else log "${RED}镜像拉取失败: $NEW_IMAGE${NC}" fi fi fi fi # 休眠 sleep $INTERVAL done