2025-12-12 18:04:57 +08:00
|
|
|
|
#!/bin/bash
|
|
|
|
|
|
# ============================================
|
|
|
|
|
|
# XingRin Agent
|
2025-12-19 19:20:15 +08:00
|
|
|
|
# 用途:心跳上报 + 负载监控 + 版本检查
|
2025-12-12 18:04:57 +08:00
|
|
|
|
# 适用:远程 VPS 或 Docker 容器内
|
|
|
|
|
|
# ============================================
|
|
|
|
|
|
|
|
|
|
|
|
# 检查是否禁用 Agent
|
|
|
|
|
|
if [ "${AGENT_DISABLED:-false}" = "true" ]; then
|
|
|
|
|
|
echo "[AGENT] 已禁用,跳过启动"
|
|
|
|
|
|
exit 0
|
|
|
|
|
|
fi
|
|
|
|
|
|
|
|
|
|
|
|
# 配置
|
|
|
|
|
|
MARKER_DIR="/opt/xingrin"
|
|
|
|
|
|
SRC_DIR="${MARKER_DIR}/src"
|
|
|
|
|
|
ENV_FILE="${SRC_DIR}/backend/.env"
|
|
|
|
|
|
INTERVAL=${AGENT_INTERVAL:-3}
|
|
|
|
|
|
|
2025-12-19 19:20:15 +08:00
|
|
|
|
# Agent 版本(从环境变量获取,由 Docker 镜像构建时注入)
|
|
|
|
|
|
AGENT_VERSION="${IMAGE_TAG:-unknown}"
|
|
|
|
|
|
|
2025-12-12 18:04:57 +08:00
|
|
|
|
# 颜色定义
|
|
|
|
|
|
GREEN='\033[0;32m'
|
|
|
|
|
|
RED='\033[0;31m'
|
|
|
|
|
|
YELLOW='\033[0;33m'
|
|
|
|
|
|
NC='\033[0m'
|
|
|
|
|
|
|
|
|
|
|
|
log() {
|
|
|
|
|
|
echo -e "[$(date +'%Y-%m-%d %H:%M:%S')] [AGENT] $1"
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
# 检测运行模式:容器内 or 远程 VPS
|
|
|
|
|
|
# 如果 /.dockerenv 存在,说明在容器内
|
|
|
|
|
|
if [ -f "/.dockerenv" ]; then
|
|
|
|
|
|
RUN_MODE="container"
|
|
|
|
|
|
log "运行模式: Docker 容器内"
|
|
|
|
|
|
else
|
|
|
|
|
|
RUN_MODE="remote"
|
|
|
|
|
|
log "运行模式: 远程 VPS"
|
|
|
|
|
|
|
|
|
|
|
|
# 远程模式:检测 Docker 命令
|
|
|
|
|
|
if docker info >/dev/null 2>&1; then
|
|
|
|
|
|
DOCKER_CMD="docker"
|
|
|
|
|
|
else
|
|
|
|
|
|
DOCKER_CMD="sudo docker"
|
|
|
|
|
|
fi
|
|
|
|
|
|
fi
|
|
|
|
|
|
|
|
|
|
|
|
# 加载环境变量(远程模式从文件,容器模式从环境变量)
|
|
|
|
|
|
if [ "$RUN_MODE" = "remote" ] && [ -f "$ENV_FILE" ]; then
|
|
|
|
|
|
set -a
|
|
|
|
|
|
source "$ENV_FILE"
|
|
|
|
|
|
set +a
|
|
|
|
|
|
fi
|
|
|
|
|
|
|
|
|
|
|
|
# 获取配置
|
2025-12-19 18:37:05 +08:00
|
|
|
|
# SERVER_URL: 后端 API 地址(容器内用 http://server:8888,远程用 https://{PUBLIC_HOST})
|
2025-12-12 18:04:57 +08:00
|
|
|
|
API_URL="${HEARTBEAT_API_URL:-${SERVER_URL:-}}"
|
|
|
|
|
|
WORKER_NAME="${WORKER_NAME:-}"
|
|
|
|
|
|
IS_LOCAL="${IS_LOCAL:-false}"
|
|
|
|
|
|
|
|
|
|
|
|
# 容器模式默认标记为本地节点
|
|
|
|
|
|
if [ "$RUN_MODE" = "container" ]; then
|
|
|
|
|
|
IS_LOCAL="true"
|
|
|
|
|
|
fi
|
|
|
|
|
|
|
|
|
|
|
|
log "${GREEN}Agent 启动...${NC}"
|
|
|
|
|
|
log "心跳间隔: ${INTERVAL}s"
|
|
|
|
|
|
|
|
|
|
|
|
if [ -z "$API_URL" ]; then
|
|
|
|
|
|
log "${RED}错误: 未配置 API 地址 (HEARTBEAT_API_URL 或 SERVER_URL)${NC}"
|
|
|
|
|
|
exit 1
|
|
|
|
|
|
fi
|
|
|
|
|
|
|
|
|
|
|
|
log "API 地址: ${API_URL}"
|
|
|
|
|
|
|
|
|
|
|
|
# ============================================
|
|
|
|
|
|
# 自注册功能(如果 WORKER_ID 未设置)
|
|
|
|
|
|
# ============================================
|
|
|
|
|
|
register_worker() {
|
|
|
|
|
|
if [ -z "$WORKER_NAME" ]; then
|
|
|
|
|
|
WORKER_NAME="Worker-$(hostname)"
|
|
|
|
|
|
fi
|
|
|
|
|
|
|
|
|
|
|
|
log "注册 Worker: ${WORKER_NAME}..."
|
|
|
|
|
|
|
|
|
|
|
|
REGISTER_DATA=$(cat <<EOF
|
|
|
|
|
|
{
|
|
|
|
|
|
"name": "$WORKER_NAME",
|
|
|
|
|
|
"is_local": $IS_LOCAL
|
|
|
|
|
|
}
|
|
|
|
|
|
EOF
|
|
|
|
|
|
)
|
|
|
|
|
|
|
2025-12-19 19:41:12 +08:00
|
|
|
|
RESPONSE=$(curl -k -s -X POST \
|
2025-12-12 18:04:57 +08:00
|
|
|
|
-H "Content-Type: application/json" \
|
|
|
|
|
|
-d "$REGISTER_DATA" \
|
|
|
|
|
|
"${API_URL}/api/workers/register/" 2>/dev/null)
|
|
|
|
|
|
|
|
|
|
|
|
if [ $? -eq 0 ]; then
|
|
|
|
|
|
# 解析返回的 workerId(API 使用 camelCase)
|
|
|
|
|
|
WORKER_ID=$(echo "$RESPONSE" | grep -oE '"workerId":\s*[0-9]+' | grep -oE '[0-9]+')
|
|
|
|
|
|
if [ -n "$WORKER_ID" ]; then
|
|
|
|
|
|
log "${GREEN}注册成功: ${WORKER_NAME} (ID: ${WORKER_ID})${NC}"
|
|
|
|
|
|
return 0
|
|
|
|
|
|
fi
|
|
|
|
|
|
fi
|
|
|
|
|
|
|
|
|
|
|
|
log "${RED}注册失败: ${RESPONSE}${NC}"
|
|
|
|
|
|
return 1
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
# 如果没有 WORKER_ID,执行自注册
|
|
|
|
|
|
if [ -z "$WORKER_ID" ]; then
|
|
|
|
|
|
# 等待 Server 就绪
|
|
|
|
|
|
log "等待 Server 就绪..."
|
|
|
|
|
|
for i in $(seq 1 30); do
|
2025-12-19 18:37:05 +08:00
|
|
|
|
if curl -k -s "${API_URL}/api/" > /dev/null 2>&1; then
|
2025-12-12 18:04:57 +08:00
|
|
|
|
log "${GREEN}Server 已就绪${NC}"
|
|
|
|
|
|
break
|
|
|
|
|
|
fi
|
|
|
|
|
|
log "Server 未就绪,等待... ($i/30)"
|
|
|
|
|
|
sleep 5
|
|
|
|
|
|
done
|
|
|
|
|
|
|
|
|
|
|
|
# 注册
|
|
|
|
|
|
while ! register_worker; do
|
|
|
|
|
|
log "${YELLOW}注册失败,5 秒后重试...${NC}"
|
|
|
|
|
|
sleep 5
|
|
|
|
|
|
done
|
|
|
|
|
|
fi
|
|
|
|
|
|
|
|
|
|
|
|
log "Worker ID: ${WORKER_ID}"
|
|
|
|
|
|
|
|
|
|
|
|
# ============================================
|
|
|
|
|
|
# 心跳循环
|
|
|
|
|
|
# Agent 独立运行,始终发送心跳
|
|
|
|
|
|
# 主服务器根据心跳数据选择负载最低的节点分发任务
|
|
|
|
|
|
# ============================================
|
|
|
|
|
|
while true; do
|
|
|
|
|
|
# 收集系统负载(CPU + 内存)
|
|
|
|
|
|
# 容器内使用挂载的 /host/proc 获取宿主机数据
|
|
|
|
|
|
if [ -d "/host/proc" ]; then
|
|
|
|
|
|
PROC_DIR="/host/proc"
|
|
|
|
|
|
else
|
|
|
|
|
|
PROC_DIR="/proc"
|
|
|
|
|
|
fi
|
|
|
|
|
|
|
|
|
|
|
|
# CPU 使用率(百分比数值)
|
2025-12-12 20:19:51 +08:00
|
|
|
|
# /proc/stat 是累计值,需要两次采样计算差值
|
|
|
|
|
|
CPU_STAT1=$(grep 'cpu ' ${PROC_DIR}/stat | awk '{print $2,$3,$4,$5,$6,$7,$8}')
|
|
|
|
|
|
sleep 0.5
|
|
|
|
|
|
CPU_STAT2=$(grep 'cpu ' ${PROC_DIR}/stat | awk '{print $2,$3,$4,$5,$6,$7,$8}')
|
|
|
|
|
|
CPU_PERCENT=$(echo "$CPU_STAT1 $CPU_STAT2" | awk '{
|
|
|
|
|
|
user1=$1; nice1=$2; sys1=$3; idle1=$4; iowait1=$5; irq1=$6; softirq1=$7;
|
|
|
|
|
|
user2=$8; nice2=$9; sys2=$10; idle2=$11; iowait2=$12; irq2=$13; softirq2=$14;
|
|
|
|
|
|
total1=user1+nice1+sys1+idle1+iowait1+irq1+softirq1;
|
|
|
|
|
|
total2=user2+nice2+sys2+idle2+iowait2+irq2+softirq2;
|
|
|
|
|
|
idle_diff=idle2-idle1;
|
|
|
|
|
|
total_diff=total2-total1;
|
|
|
|
|
|
if(total_diff>0) printf "%.1f", (1-idle_diff/total_diff)*100;
|
|
|
|
|
|
else printf "0.0";
|
|
|
|
|
|
}')
|
2025-12-12 18:04:57 +08:00
|
|
|
|
|
|
|
|
|
|
# 内存使用率(百分比数值)
|
|
|
|
|
|
if [ -d "/host/proc" ]; then
|
|
|
|
|
|
# 从 /host/proc/meminfo 读取
|
|
|
|
|
|
MEM_TOTAL=$(grep 'MemTotal' ${PROC_DIR}/meminfo | awk '{print $2}')
|
|
|
|
|
|
MEM_AVAILABLE=$(grep 'MemAvailable' ${PROC_DIR}/meminfo | awk '{print $2}')
|
|
|
|
|
|
MEM_PERCENT=$(awk "BEGIN {printf \"%.1f\", 100 - ($MEM_AVAILABLE / $MEM_TOTAL * 100)}")
|
|
|
|
|
|
else
|
|
|
|
|
|
# 使用 free 命令
|
|
|
|
|
|
MEM_PERCENT=$(free | grep Mem | awk '{printf "%.1f", $3/$2 * 100}')
|
|
|
|
|
|
fi
|
|
|
|
|
|
|
|
|
|
|
|
# 构建 JSON 数据(使用数值而非字符串,便于比较和排序)
|
2025-12-19 19:20:15 +08:00
|
|
|
|
# 包含版本号,供 Server 端检查版本一致性
|
2025-12-12 18:04:57 +08:00
|
|
|
|
JSON_DATA=$(cat <<EOF
|
|
|
|
|
|
{
|
|
|
|
|
|
"cpu_percent": $CPU_PERCENT,
|
2025-12-19 19:20:15 +08:00
|
|
|
|
"memory_percent": $MEM_PERCENT,
|
|
|
|
|
|
"version": "$AGENT_VERSION"
|
2025-12-12 18:04:57 +08:00
|
|
|
|
}
|
|
|
|
|
|
EOF
|
|
|
|
|
|
)
|
|
|
|
|
|
|
2025-12-19 19:20:15 +08:00
|
|
|
|
# 发送心跳,获取响应内容
|
|
|
|
|
|
RESPONSE_FILE=$(mktemp)
|
|
|
|
|
|
HTTP_CODE=$(curl -k -s -o "$RESPONSE_FILE" -w "%{http_code}" -X POST \
|
2025-12-12 18:04:57 +08:00
|
|
|
|
-H "Content-Type: application/json" \
|
|
|
|
|
|
-d "$JSON_DATA" \
|
|
|
|
|
|
"${API_URL}/api/workers/${WORKER_ID}/heartbeat/" 2>/dev/null || echo "000")
|
2025-12-19 19:20:15 +08:00
|
|
|
|
RESPONSE_BODY=$(cat "$RESPONSE_FILE" 2>/dev/null)
|
|
|
|
|
|
rm -f "$RESPONSE_FILE"
|
2025-12-12 18:04:57 +08:00
|
|
|
|
|
2025-12-19 19:20:15 +08:00
|
|
|
|
if [ "$HTTP_CODE" != "200" ] && [ "$HTTP_CODE" != "201" ]; then
|
|
|
|
|
|
log "${YELLOW}心跳发送失败 (HTTP $HTTP_CODE)${NC}"
|
|
|
|
|
|
else
|
|
|
|
|
|
# 检查是否需要更新
|
|
|
|
|
|
NEED_UPDATE=$(echo "$RESPONSE_BODY" | grep -oE '"need_update":\s*(true|false)' | grep -oE '(true|false)')
|
|
|
|
|
|
if [ "$NEED_UPDATE" = "true" ]; then
|
|
|
|
|
|
SERVER_VERSION=$(echo "$RESPONSE_BODY" | grep -oE '"server_version":\s*"[^"]+"' | sed 's/.*"\([^"]*\)"$/\1/')
|
|
|
|
|
|
log "${YELLOW}检测到版本不匹配: Agent=$AGENT_VERSION, Server=$SERVER_VERSION${NC}"
|
|
|
|
|
|
log "${GREEN}正在自动更新...${NC}"
|
|
|
|
|
|
|
|
|
|
|
|
# 执行自动更新
|
|
|
|
|
|
if [ "$RUN_MODE" = "container" ]; then
|
|
|
|
|
|
# 容器模式:通知外部重启(退出后由 docker-compose restart policy 重启)
|
|
|
|
|
|
log "容器模式:退出以触发重启更新"
|
|
|
|
|
|
exit 0
|
|
|
|
|
|
else
|
|
|
|
|
|
# 远程模式:拉取新镜像并重启 agent 容器
|
|
|
|
|
|
log "远程模式:更新 agent 镜像..."
|
|
|
|
|
|
DOCKER_USER="${DOCKER_USER:-yyhuni}"
|
|
|
|
|
|
NEW_IMAGE="${DOCKER_USER}/xingrin-agent:${SERVER_VERSION}"
|
|
|
|
|
|
|
|
|
|
|
|
# 拉取新镜像
|
|
|
|
|
|
if $DOCKER_CMD pull "$NEW_IMAGE" 2>/dev/null; then
|
|
|
|
|
|
log "${GREEN}镜像拉取成功: $NEW_IMAGE${NC}"
|
|
|
|
|
|
|
|
|
|
|
|
# 停止当前容器并用新镜像重启
|
|
|
|
|
|
CONTAINER_NAME="xingrin-agent"
|
|
|
|
|
|
$DOCKER_CMD stop "$CONTAINER_NAME" 2>/dev/null || true
|
|
|
|
|
|
$DOCKER_CMD rm "$CONTAINER_NAME" 2>/dev/null || true
|
|
|
|
|
|
|
|
|
|
|
|
# 重新启动(使用相同的环境变量)
|
|
|
|
|
|
$DOCKER_CMD run -d \
|
|
|
|
|
|
--name "$CONTAINER_NAME" \
|
|
|
|
|
|
--restart unless-stopped \
|
|
|
|
|
|
-e HEARTBEAT_API_URL="$API_URL" \
|
|
|
|
|
|
-e WORKER_ID="$WORKER_ID" \
|
|
|
|
|
|
-e IMAGE_TAG="$SERVER_VERSION" \
|
|
|
|
|
|
-v /proc:/host/proc:ro \
|
|
|
|
|
|
"$NEW_IMAGE"
|
|
|
|
|
|
|
|
|
|
|
|
log "${GREEN}Agent 已更新到 $SERVER_VERSION${NC}"
|
|
|
|
|
|
exit 0
|
|
|
|
|
|
else
|
|
|
|
|
|
log "${RED}镜像拉取失败: $NEW_IMAGE${NC}"
|
|
|
|
|
|
fi
|
|
|
|
|
|
fi
|
|
|
|
|
|
fi
|
2025-12-12 18:04:57 +08:00
|
|
|
|
fi
|
|
|
|
|
|
|
|
|
|
|
|
# 休眠
|
|
|
|
|
|
sleep $INTERVAL
|
|
|
|
|
|
done
|