Files
xingrin/backend/scripts/worker-deploy/agent.sh

249 lines
8.1 KiB
Bash
Raw Normal View History

2025-12-12 18:04:57 +08:00
#!/bin/bash
# ============================================
# XingRin Agent
# Purpose: heartbeat reporting + load monitoring + version check
# Target: remote VPS or inside a Docker container
# ============================================
# Kill switch: AGENT_DISABLED=true makes the script leave right away with
# a success status, before anything else is set up.
case "${AGENT_DISABLED:-false}" in
    true)
        echo "[AGENT] 已禁用,跳过启动"
        exit 0
        ;;
esac
# Configuration
MARKER_DIR="/opt/xingrin"
SRC_DIR="${MARKER_DIR}/src"
ENV_FILE="${SRC_DIR}/backend/.env"
# Pause between heartbeats, in seconds; AGENT_INTERVAL overrides the default.
INTERVAL="${AGENT_INTERVAL:-3}"

# Agent version, read from the IMAGE_TAG environment variable (injected
# when the Docker image is built); "unknown" when it is not provided.
AGENT_VERSION="${IMAGE_TAG:-unknown}"

# Colour escape sequences. They are stored as literal backslash sequences
# and only become colours when printed by something that interprets them.
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[0;33m'
NC='\033[0m'
# Write one log line to stdout in the form
#   [YYYY-MM-DD HH:MM:SS] [AGENT] <message>
# Arguments:
#   $1 - message text; backslash escapes in it (such as the colour codes)
#        are interpreted on output.
log() {
    local stamp
    stamp=$(date +'%Y-%m-%d %H:%M:%S')
    printf '%b\n' "[${stamp}] [AGENT] $1"
}
# Work out where the agent is running: inside a Docker container or on a
# remote VPS. The presence of /.dockerenv is taken to mean "container".
if [ ! -f "/.dockerenv" ]; then
    RUN_MODE="remote"
    log "运行模式: 远程 VPS"
    # Remote mode: call docker directly when `docker info` succeeds for the
    # current user, otherwise go through sudo.
    if ! docker info >/dev/null 2>&1; then
        DOCKER_CMD="sudo docker"
    else
        DOCKER_CMD="docker"
    fi
else
    RUN_MODE="container"
    log "运行模式: Docker 容器内"
fi
# Load environment variables. Remote mode reads them from the env file (if
# it exists); container mode relies on the variables already present in the
# process environment.
if [ "$RUN_MODE" = "remote" ]; then
    if [ -f "$ENV_FILE" ]; then
        # Export every variable assigned while the file is being read.
        set -o allexport
        . "$ENV_FILE"
        set +o allexport
    fi
fi
# Resolve configuration.
# SERVER_URL is the backend API address (http://server:8888 inside the
# container, https://{PUBLIC_HOST} for remote nodes). HEARTBEAT_API_URL,
# when set, takes precedence over it.
API_URL="${HEARTBEAT_API_URL:-${SERVER_URL:-}}"
WORKER_NAME="${WORKER_NAME:-}"
IS_LOCAL="${IS_LOCAL:-false}"

# Container mode always marks the node as local.
if [ "$RUN_MODE" = "container" ]; then
    IS_LOCAL="true"
fi
# Startup banner.
log "${GREEN}Agent 启动...${NC}"
log "心跳间隔: ${INTERVAL}s"

# A backend address is mandatory: stop with status 1 when neither
# HEARTBEAT_API_URL nor SERVER_URL produced one.
[ -n "$API_URL" ] || {
    log "${RED}错误: 未配置 API 地址 (HEARTBEAT_API_URL 或 SERVER_URL)${NC}"
    exit 1
}

log "API 地址: ${API_URL}"
# ============================================
# Self-registration (used when WORKER_ID is not set)
# ============================================

# Register this node with the server and store the returned ID.
# Globals:
#   API_URL     (read)    base URL of the backend
#   WORKER_NAME (written) set to "Worker-<hostname>" when empty
#   IS_LOCAL    (read)    sent as JSON true for true/True/TRUE/1, else false
#   WORKER_ID   (written) numeric ID taken from the response on success
# Outputs:
#   progress and error lines through log
# Returns:
#   0 when the request succeeded and the response contained a numeric
#   "workerId"; 1 otherwise.
register_worker() {
    local escaped_name is_local_json register_data response
    local id_re='"workerId":[[:space:]]*([0-9]+)'

    if [ -z "$WORKER_NAME" ]; then
        WORKER_NAME="Worker-$(hostname)"
    fi

    log "注册 Worker: ${WORKER_NAME}..."

    # Escape backslashes and double quotes so the name cannot break out of
    # the JSON string. Control characters are not handled.
    escaped_name=${WORKER_NAME//\\/\\\\}
    escaped_name=${escaped_name//\"/\\\"}

    # Only ever emit a valid JSON boolean, whatever IS_LOCAL contains.
    case "$IS_LOCAL" in
        true | True | TRUE | 1) is_local_json="true" ;;
        *) is_local_json="false" ;;
    esac

    printf -v register_data '{\n    "name": "%s",\n    "is_local": %s\n}' \
        "$escaped_name" "$is_local_json"

    # -k: certificate verification is skipped, as in the rest of the script.
    if response=$(curl -k -s -X POST \
        -H "Content-Type: application/json" \
        -d "$register_data" \
        "${API_URL}/api/workers/register/" 2>/dev/null); then
        # The API answers in camelCase, so the ID is under "workerId".
        if [[ "$response" =~ $id_re ]]; then
            WORKER_ID="${BASH_REMATCH[1]}"
            log "${GREEN}注册成功: ${WORKER_NAME} (ID: ${WORKER_ID})${NC}"
            return 0
        fi
    fi

    log "${RED}注册失败: ${response}${NC}"
    return 1
}
# No WORKER_ID supplied: register this node with the server first.
if [ -z "$WORKER_ID" ]; then
    # Give the server up to 30 probes, 5 seconds apart, to start answering.
    # Any completed HTTP exchange counts as "ready" (curl -s without -f).
    log "等待 Server 就绪..."
    server_ready="false"
    for ((i = 1; i <= 30; i++)); do
        if curl -k -s "${API_URL}/api/" >/dev/null 2>&1; then
            server_ready="true"
            log "${GREEN}Server 已就绪${NC}"
            break
        fi
        log "Server 未就绪,等待... ($i/30)"
        sleep 5
    done

    # Registration is attempted regardless, but say so instead of falling
    # through silently when the server never answered.
    if [ "$server_ready" != "true" ]; then
        log "${YELLOW}Server 在 30 次检查后仍未就绪,继续尝试注册...${NC}"
    fi

    # Register, retrying every 5 seconds until it succeeds.
    while ! register_worker; do
        log "${YELLOW}注册失败5 秒后重试...${NC}"
        sleep 5
    done
fi

log "Worker ID: ${WORKER_ID}"
# ============================================
# Heartbeat loop
# The agent runs on its own and keeps sending heartbeats. According to the
# original notes, the main server uses the reported load to dispatch tasks
# to the least-loaded node.
# ============================================

# Inside a container the host's /proc is expected to be mounted at
# /host/proc, so that host-wide figures are reported.
if [ -d "/host/proc" ]; then
    PROC_DIR="/host/proc"
else
    PROC_DIR="/proc"
fi

# Print the seven counters after the label on the aggregate "cpu" line of
# ${PROC_DIR}/stat (user nice system idle iowait irq softirq), separated by
# spaces.
read_cpu_times() {
    awk '$1 == "cpu" { print $2, $3, $4, $5, $6, $7, $8; exit }' \
        "${PROC_DIR}/stat" 2>/dev/null
}

# Print CPU usage as a percentage with one decimal.
# The counters in /proc/stat are cumulative, so two samples 0.5 s apart are
# taken and the share of non-idle time in between is computed. Prints 0.0
# when either sample is incomplete or no time has elapsed.
sample_cpu_percent() {
    local first second
    first=$(read_cpu_times)
    sleep 0.5
    second=$(read_cpu_times)
    # LC_ALL=C keeps the decimal separator a dot so the value is valid JSON.
    printf '%s %s\n' "$first" "$second" | LC_ALL=C awk '{
        if (NF < 14) { printf "0.0"; exit }
        total1 = $1 + $2 + $3 + $4 + $5 + $6 + $7
        total2 = $8 + $9 + $10 + $11 + $12 + $13 + $14
        idle_diff = $11 - $4
        total_diff = total2 - total1
        if (total_diff > 0) {
            printf "%.1f", (1 - idle_diff / total_diff) * 100
        } else {
            printf "0.0"
        }
    }'
}

# Print memory usage as a percentage with one decimal.
# With /host/proc present the value is derived from MemTotal and
# MemAvailable in meminfo; otherwise from the used/total columns of `free`.
# Prints 0.0 instead of dividing by zero when the totals are missing.
sample_mem_percent() {
    if [ -d "/host/proc" ]; then
        LC_ALL=C awk '
            $1 == "MemTotal:"     { total = $2 }
            $1 == "MemAvailable:" { avail = $2; seen = 1 }
            END {
                if (total > 0 && seen) {
                    printf "%.1f", 100 - (avail / total * 100)
                } else {
                    printf "0.0"
                }
            }' "${PROC_DIR}/meminfo" 2>/dev/null
    else
        # LC_ALL=C so the "Mem:" row label is not localised.
        LC_ALL=C free | LC_ALL=C awk '/^Mem:/ {
            if ($2 > 0) {
                printf "%.1f", $3 / $2 * 100
            } else {
                printf "0.0"
            }
        }'
    fi
}

# Patterns for the heartbeat response. Both snake_case and camelCase keys
# are accepted.
# NOTE(review): registration parses a camelCase key ("workerId"), so the
# heartbeat response may be camelCase too - confirm against the server.
NEED_UPDATE_RE='"(need_update|needUpdate)":[[:space:]]*true'
SERVER_VERSION_RE='"(server_version|serverVersion)":[[:space:]]*"([^"]+)"'

while true; do
    # Collect system load (CPU + memory) as plain numbers.
    CPU_PERCENT=$(sample_cpu_percent)
    MEM_PERCENT=$(sample_mem_percent)
    # An empty sample (e.g. a missing tool) would make the JSON invalid;
    # report 0.0 so the heartbeat itself still goes through.
    CPU_PERCENT=${CPU_PERCENT:-0.0}
    MEM_PERCENT=${MEM_PERCENT:-0.0}

    # Build the JSON payload. The load values are numbers rather than
    # strings so they can be compared and sorted; the version lets the
    # server check that agent and server match.
    printf -v JSON_DATA \
        '{\n    "cpu_percent": %s,\n    "memory_percent": %s,\n    "version": "%s"\n}' \
        "$CPU_PERCENT" "$MEM_PERCENT" "$AGENT_VERSION"

    # Send the heartbeat. curl appends "\n<status>" after the body, which
    # avoids a temporary file; on a connection failure curl itself reports
    # the status as 000.
    RESPONSE=$(curl -k -s -w '\n%{http_code}' -X POST \
        -H "Content-Type: application/json" \
        -d "$JSON_DATA" \
        "${API_URL}/api/workers/${WORKER_ID}/heartbeat/" 2>/dev/null)
    HTTP_CODE=${RESPONSE##*$'\n'}
    RESPONSE_BODY=${RESPONSE%$'\n'*}
    # Empty only when curl produced nothing at all (e.g. not installed).
    HTTP_CODE=${HTTP_CODE:-000}

    if [ "$HTTP_CODE" != "200" ] && [ "$HTTP_CODE" != "201" ]; then
        log "${YELLOW}心跳发送失败 (HTTP $HTTP_CODE)${NC}"
    elif [[ "$RESPONSE_BODY" =~ $NEED_UPDATE_RE ]]; then
        # The server asked for an update.
        SERVER_VERSION=""
        if [[ "$RESPONSE_BODY" =~ $SERVER_VERSION_RE ]]; then
            SERVER_VERSION="${BASH_REMATCH[2]}"
        fi
        log "${YELLOW}检测到版本不匹配: Agent=$AGENT_VERSION, Server=$SERVER_VERSION${NC}"
        log "${GREEN}正在自动更新...${NC}"

        if [ "$RUN_MODE" = "container" ]; then
            # Container mode: exit and rely on the container restart policy
            # (docker-compose) to bring the agent back up.
            log "容器模式:退出以触发重启更新"
            exit 0
        elif [ -z "$SERVER_VERSION" ]; then
            # Without a version there is no image tag to pull; keep running.
            log "${RED}无法从心跳响应中解析 Server 版本,跳过更新${NC}"
        else
            # Remote mode: pull the new image and replace the agent container.
            log "远程模式:更新 agent 镜像..."
            DOCKER_USER="${DOCKER_USER:-yyhuni}"
            NEW_IMAGE="${DOCKER_USER}/xingrin-agent:${SERVER_VERSION}"
            CONTAINER_NAME="xingrin-agent"

            # DOCKER_CMD may be "sudo docker" and must split into words.
            # shellcheck disable=SC2086
            if $DOCKER_CMD pull "$NEW_IMAGE" 2>/dev/null; then
                log "${GREEN}镜像拉取成功: $NEW_IMAGE${NC}"

                # Remove the current container; failures are ignored because
                # it may simply not exist.
                # shellcheck disable=SC2086
                $DOCKER_CMD stop "$CONTAINER_NAME" 2>/dev/null || true
                # shellcheck disable=SC2086
                $DOCKER_CMD rm "$CONTAINER_NAME" 2>/dev/null || true

                # Start the new image, passing on the API address, worker ID
                # and version. Only hand over (exit) when it really started;
                # otherwise keep sending heartbeats from this process.
                # shellcheck disable=SC2086
                if $DOCKER_CMD run -d \
                    --name "$CONTAINER_NAME" \
                    --restart unless-stopped \
                    -e HEARTBEAT_API_URL="$API_URL" \
                    -e WORKER_ID="$WORKER_ID" \
                    -e IMAGE_TAG="$SERVER_VERSION" \
                    -v /proc:/host/proc:ro \
                    "$NEW_IMAGE"; then
                    log "${GREEN}Agent 已更新到 $SERVER_VERSION${NC}"
                    exit 0
                else
                    log "${RED}新容器启动失败: $NEW_IMAGE${NC}"
                fi
            else
                log "${RED}镜像拉取失败: $NEW_IMAGE${NC}"
            fi
        fi
    fi

    # Wait until the next heartbeat.
    sleep "$INTERVAL"
done