mirror of
https://github.com/yyhuni/xingrin.git
synced 2026-01-31 11:46:16 +08:00
457 lines
17 KiB
Python
457 lines
17 KiB
Python
"""
|
||
WebSocket Consumer - Worker 交互式终端 (使用 PTY)
|
||
"""
|
||
|
||
import json
|
||
import logging
|
||
import asyncio
|
||
import os
|
||
from channels.generic.websocket import AsyncWebsocketConsumer
|
||
from asgiref.sync import sync_to_async
|
||
|
||
from django.conf import settings
|
||
|
||
from apps.engine.services import WorkerService
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
class WorkerDeployConsumer(AsyncWebsocketConsumer):
|
||
"""
|
||
Worker 交互式终端 WebSocket Consumer
|
||
|
||
使用 paramiko invoke_shell 实现真正的交互式终端
|
||
"""
|
||
|
||
def __init__(self, *args, **kwargs):
|
||
super().__init__(*args, **kwargs)
|
||
self.ssh_client = None
|
||
self.shell = None
|
||
self.worker = None
|
||
self.read_task = None
|
||
self.worker_service = WorkerService()
|
||
|
||
async def connect(self):
|
||
"""连接时加入对应 Worker 的组并自动建立 SSH 连接"""
|
||
self.worker_id = self.scope['url_route']['kwargs']['worker_id']
|
||
self.group_name = f'worker_deploy_{self.worker_id}'
|
||
|
||
await self.channel_layer.group_add(self.group_name, self.channel_name)
|
||
await self.accept()
|
||
|
||
logger.info(f"终端已连接 - Worker: {self.worker_id}")
|
||
|
||
# 自动建立 SSH 连接
|
||
await self._auto_ssh_connect()
|
||
|
||
async def disconnect(self, close_code):
|
||
"""断开时清理资源"""
|
||
if self.read_task:
|
||
self.read_task.cancel()
|
||
if self.shell:
|
||
try:
|
||
self.shell.close()
|
||
except Exception:
|
||
pass
|
||
if self.ssh_client:
|
||
try:
|
||
self.ssh_client.close()
|
||
except Exception:
|
||
pass
|
||
|
||
await self.channel_layer.group_discard(self.group_name, self.channel_name)
|
||
logger.info(f"终端已断开 - Worker: {self.worker_id}")
|
||
|
||
async def receive(self, text_data=None, bytes_data=None):
|
||
"""接收客户端消息"""
|
||
if bytes_data:
|
||
# 二进制数据直接发送到 shell
|
||
if self.shell:
|
||
await asyncio.to_thread(self.shell.send, bytes_data)
|
||
return
|
||
|
||
if not text_data:
|
||
return
|
||
|
||
try:
|
||
data = json.loads(text_data)
|
||
msg_type = data.get('type')
|
||
|
||
if msg_type == 'resize':
|
||
cols = data.get('cols', 80)
|
||
rows = data.get('rows', 24)
|
||
if self.shell:
|
||
await asyncio.to_thread(self.shell.resize_pty, cols, rows)
|
||
|
||
elif msg_type == 'input':
|
||
# 终端输入
|
||
if self.shell:
|
||
text = data.get('data', '')
|
||
await asyncio.to_thread(self.shell.send, text)
|
||
|
||
elif msg_type == 'deploy':
|
||
# 执行部署脚本(后台运行)
|
||
await self._run_deploy_script()
|
||
|
||
elif msg_type == 'attach':
|
||
# 查看部署进度(attach 到 tmux 会话)
|
||
await self._attach_deploy_session()
|
||
|
||
elif msg_type == 'uninstall':
|
||
# 执行卸载脚本(后台运行)
|
||
await self._run_uninstall_script()
|
||
|
||
except json.JSONDecodeError:
|
||
# 可能是普通文本输入
|
||
if self.shell and text_data:
|
||
await asyncio.to_thread(self.shell.send, text_data)
|
||
except Exception as e:
|
||
logger.error(f"处理消息错误: {e}")
|
||
|
||
async def _auto_ssh_connect(self):
|
||
"""自动从数据库读取密码并连接"""
|
||
logger.info(f"[SSH] 开始自动连接 - Worker ID: {self.worker_id}")
|
||
# 通过服务层获取 Worker 节点
|
||
# thread_sensitive=False 确保在新线程中运行,避免数据库连接问题
|
||
self.worker = await sync_to_async(self.worker_service.get_worker, thread_sensitive=False)(self.worker_id)
|
||
logger.info(f"[SSH] Worker 查询结果: {self.worker}")
|
||
|
||
if not self.worker:
|
||
await self.send(text_data=json.dumps({
|
||
'type': 'error',
|
||
'message': 'Worker 不存在'
|
||
}))
|
||
return
|
||
|
||
if not self.worker.password:
|
||
await self.send(text_data=json.dumps({
|
||
'type': 'error',
|
||
'message': '未配置 SSH 密码,请先编辑节点信息'
|
||
}))
|
||
return
|
||
|
||
# 使用默认终端大小
|
||
await self._ssh_connect(self.worker.password, 80, 24)
|
||
|
||
async def _ssh_connect(self, password: str, cols: int = 80, rows: int = 24):
|
||
"""建立 SSH 连接并启动交互式 shell (使用 tmux 持久化会话)"""
|
||
try:
|
||
import paramiko
|
||
except ImportError:
|
||
await self.send(text_data=json.dumps({
|
||
'type': 'error',
|
||
'message': '服务器缺少 paramiko 库'
|
||
}))
|
||
return
|
||
|
||
# self.worker 已在 _auto_ssh_connect 中查询
|
||
try:
|
||
ssh = paramiko.SSHClient()
|
||
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
||
|
||
await asyncio.to_thread(
|
||
ssh.connect,
|
||
self.worker.ip_address,
|
||
port=self.worker.ssh_port,
|
||
username=self.worker.username,
|
||
password=password,
|
||
timeout=30
|
||
)
|
||
|
||
self.ssh_client = ssh
|
||
|
||
# 启动交互式 shell(连接时不做 tmux 安装,仅提供普通 shell)
|
||
self.shell = await asyncio.to_thread(
|
||
ssh.invoke_shell,
|
||
term='xterm-256color',
|
||
width=cols,
|
||
height=rows
|
||
)
|
||
|
||
# 发送连接成功消息
|
||
logger.info(f"[SSH] 准备发送 connected 消息 - Worker: {self.worker_id}")
|
||
await self.send(text_data=json.dumps({
|
||
'type': 'connected'
|
||
}))
|
||
logger.info(f"[SSH] connected 消息已发送 - Worker: {self.worker_id}")
|
||
|
||
# 启动读取任务
|
||
self.read_task = asyncio.create_task(self._read_shell_output())
|
||
|
||
logger.info(f"[SSH] Shell 已连接,读取任务已启动 - Worker: {self.worker_id}")
|
||
|
||
except paramiko.AuthenticationException:
|
||
logger.error(f"[SSH] 认证失败 - Worker: {self.worker_id}")
|
||
await self.send(text_data=json.dumps({
|
||
'type': 'error',
|
||
'message': '认证失败,密码错误'
|
||
}))
|
||
except Exception as e:
|
||
logger.error(f"[SSH] 连接失败 - Worker: {self.worker_id}, Error: {e}")
|
||
await self.send(text_data=json.dumps({
|
||
'type': 'error',
|
||
'message': f'连接失败: {str(e)}'
|
||
}))
|
||
|
||
async def _read_shell_output(self):
|
||
"""持续读取 shell 输出并发送到客户端"""
|
||
try:
|
||
while self.shell and not self.shell.closed:
|
||
if self.shell.recv_ready():
|
||
data = await asyncio.to_thread(self.shell.recv, 4096)
|
||
if data:
|
||
await self.send(bytes_data=data)
|
||
else:
|
||
await asyncio.sleep(0.01)
|
||
except asyncio.CancelledError:
|
||
pass
|
||
except Exception as e:
|
||
logger.error(f"读取 shell 输出错误: {e}")
|
||
|
||
async def _run_deploy_script(self):
|
||
"""运行部署脚本(在 tmux 会话中执行,支持断线续连)
|
||
|
||
流程:
|
||
1. 通过 SFTP 上传脚本到远程服务器
|
||
2. 使用 exec_command 静默执行(不在交互式终端回显)
|
||
3. 通过 WebSocket 发送结果到前端显示
|
||
"""
|
||
if not self.ssh_client:
|
||
return
|
||
|
||
from apps.engine.services.deploy_service import (
|
||
get_bootstrap_script,
|
||
get_deploy_script,
|
||
get_start_agent_script
|
||
)
|
||
|
||
# 优先使用 settings 中配置的对外访问主机(PUBLIC_HOST)拼接 Django URL
|
||
public_host = getattr(settings, 'PUBLIC_HOST', '').strip()
|
||
server_port = getattr(settings, 'SERVER_PORT', '8888')
|
||
|
||
if not public_host:
|
||
error_msg = (
|
||
"未配置 PUBLIC_HOST,请在 docker/.env 中设置对外访问 IP/域名 "
|
||
"(PUBLIC_HOST) 并重启服务后再执行远程部署"
|
||
)
|
||
logger.error(error_msg)
|
||
await self.send(text_data=json.dumps({
|
||
'type': 'error',
|
||
'message': error_msg,
|
||
}))
|
||
return
|
||
|
||
# 远程 Worker 通过 nginx HTTPS 访问(nginx 反代到后端 8888)
|
||
# 使用 https://{PUBLIC_HOST}:{PUBLIC_PORT} 而不是直连 8888 端口
|
||
public_port = getattr(settings, 'PUBLIC_PORT', '8083')
|
||
heartbeat_api_url = f"https://{public_host}:{public_port}"
|
||
|
||
session_name = f'xingrin_deploy_{self.worker_id}'
|
||
remote_script_path = '/tmp/xingrin_deploy.sh'
|
||
|
||
# 获取外置脚本内容
|
||
bootstrap_script = get_bootstrap_script()
|
||
deploy_script = get_deploy_script()
|
||
start_script = get_start_agent_script(
|
||
heartbeat_api_url=heartbeat_api_url,
|
||
worker_id=self.worker_id
|
||
)
|
||
|
||
# 合并脚本
|
||
combined_script = f"""#!/bin/bash
|
||
set -e
|
||
|
||
# ==================== 阶段 1: 环境初始化 ====================
|
||
{bootstrap_script}
|
||
|
||
# ==================== 阶段 2: 安装 Docker ====================
|
||
{deploy_script}
|
||
|
||
# ==================== 阶段 3: 启动 Agent ====================
|
||
{start_script}
|
||
|
||
echo "SUCCESS"
|
||
"""
|
||
|
||
# 更新状态为 deploying
|
||
await sync_to_async(self.worker_service.update_status)(self.worker_id, 'deploying')
|
||
|
||
# 发送开始提示
|
||
start_msg = "\r\n\033[36m[XingRin] 正在准备部署...\033[0m\r\n"
|
||
await self.send(bytes_data=start_msg.encode())
|
||
|
||
try:
|
||
# 1. 上传脚本
|
||
sftp = await asyncio.to_thread(self.ssh_client.open_sftp)
|
||
with sftp.file(remote_script_path, 'w') as f:
|
||
f.write(combined_script)
|
||
sftp.chmod(remote_script_path, 0o755)
|
||
await asyncio.to_thread(sftp.close)
|
||
|
||
# 2. 静默执行部署命令(使用 exec_command,不会回显到终端)
|
||
deploy_cmd = f"""
|
||
# 确保 tmux 安装
|
||
if ! command -v tmux >/dev/null 2>&1; then
|
||
if command -v apt-get >/dev/null 2>&1; then
|
||
sudo apt-get update -qq && sudo apt-get install -y -qq tmux >/dev/null 2>&1
|
||
fi
|
||
fi
|
||
|
||
# 检查脚本是否存在
|
||
if [ ! -f "{remote_script_path}" ]; then
|
||
echo "SCRIPT_NOT_FOUND"
|
||
exit 1
|
||
fi
|
||
|
||
# 启动 tmux 会话
|
||
if command -v tmux >/dev/null 2>&1; then
|
||
tmux kill-session -t {session_name} 2>/dev/null || true
|
||
# 使用 bash 执行脚本,确保环境正确
|
||
tmux new-session -d -s {session_name} "bash {remote_script_path}; echo '部署完成,按回车退出'; read"
|
||
# 验证会话是否创建成功
|
||
sleep 0.5
|
||
if tmux has-session -t {session_name} 2>/dev/null; then
|
||
echo "SUCCESS"
|
||
else
|
||
echo "SESSION_CREATE_FAILED"
|
||
fi
|
||
else
|
||
echo "TMUX_NOT_FOUND"
|
||
fi
|
||
"""
|
||
stdin, stdout, stderr = await asyncio.to_thread(
|
||
self.ssh_client.exec_command, deploy_cmd
|
||
)
|
||
result = await asyncio.to_thread(stdout.read)
|
||
result = result.decode().strip()
|
||
|
||
# 3. 发送结果到前端终端显示
|
||
if "SUCCESS" in result:
|
||
# 部署任务已在后台启动,保持 deploying 状态
|
||
# 只有当心跳上报成功后才会变成 deployed(通过 heartbeat API 自动更新)
|
||
success_msg = (
|
||
"\r\n\033[32m✓ 部署任务已在后台启动\033[0m\r\n"
|
||
f"\033[90m 会话: {session_name}\033[0m\r\n"
|
||
"\r\n"
|
||
"\033[36m点击 [查看进度] 按钮查看部署输出\033[0m\r\n"
|
||
f"\033[90m或手动执行: tmux attach -t {session_name}\033[0m\r\n"
|
||
"\r\n"
|
||
)
|
||
else:
|
||
# 获取更多错误信息
|
||
err = await asyncio.to_thread(stderr.read)
|
||
err_msg = err.decode().strip() if err else ""
|
||
success_msg = f"\r\n\033[31m✗ 部署启动失败\033[0m\r\n\033[90m结果: {result}\r\n错误: {err_msg}\033[0m\r\n"
|
||
|
||
await self.send(bytes_data=success_msg.encode())
|
||
|
||
except Exception as e:
|
||
error_msg = f"\033[31m✗ 部署失败: {str(e)}\033[0m\r\n"
|
||
await self.send(bytes_data=error_msg.encode())
|
||
logger.error(f"部署脚本执行失败: {e}")
|
||
|
||
async def _run_uninstall_script(self):
|
||
"""在远程主机上执行 Worker 卸载脚本
|
||
|
||
逻辑:
|
||
1. 通过服务层读取本地 worker-uninstall.sh 内容
|
||
2. 上传到远程 /tmp/xingrin_uninstall.sh 并赋予执行权限
|
||
3. 使用 exec_command 以 bash 执行脚本
|
||
4. 将执行结果摘要写回前端终端
|
||
"""
|
||
if not self.ssh_client:
|
||
return
|
||
|
||
from apps.engine.services.deploy_service import get_uninstall_script
|
||
|
||
uninstall_script = get_uninstall_script()
|
||
remote_script_path = '/tmp/xingrin_uninstall.sh'
|
||
|
||
start_msg = "\r\n\033[36m[XingRin] 正在执行 Worker 卸载...\033[0m\r\n"
|
||
await self.send(bytes_data=start_msg.encode())
|
||
|
||
try:
|
||
# 上传卸载脚本
|
||
sftp = await asyncio.to_thread(self.ssh_client.open_sftp)
|
||
with sftp.file(remote_script_path, 'w') as f:
|
||
f.write(uninstall_script)
|
||
sftp.chmod(remote_script_path, 0o755)
|
||
await asyncio.to_thread(sftp.close)
|
||
|
||
# 执行卸载脚本
|
||
cmd = f"bash {remote_script_path}"
|
||
stdin, stdout, stderr = await asyncio.to_thread(
|
||
self.ssh_client.exec_command, cmd
|
||
)
|
||
out = await asyncio.to_thread(stdout.read)
|
||
err = await asyncio.to_thread(stderr.read)
|
||
|
||
# 转换换行符为终端格式 (\n -> \r\n)
|
||
output_text = out.decode().strip().replace('\n', '\r\n') if out else ""
|
||
error_text = err.decode().strip().replace('\n', '\r\n') if err else ""
|
||
|
||
# 简单判断是否成功(退出码 + 关键字)
|
||
exit_status = stdout.channel.recv_exit_status()
|
||
if exit_status == 0:
|
||
# 卸载成功,重置状态为 pending
|
||
await sync_to_async(self.worker_service.update_status)(self.worker_id, 'pending')
|
||
# 删除 Redis 中的心跳数据
|
||
from apps.engine.services.worker_load_service import worker_load_service
|
||
worker_load_service.delete_load(self.worker_id)
|
||
# 发送状态更新到前端
|
||
await self.send(text_data=json.dumps({
|
||
'type': 'status',
|
||
'status': 'pending' # 卸载后变为待部署状态
|
||
}))
|
||
msg = "\r\n\033[32m✓ 节点卸载完成\033[0m\r\n"
|
||
if output_text:
|
||
msg += f"\033[90m{output_text}\033[0m\r\n"
|
||
else:
|
||
msg = "\r\n\033[31m✗ Worker 卸载失败\033[0m\r\n"
|
||
if output_text:
|
||
msg += f"\033[90m输出: {output_text}\033[0m\r\n"
|
||
if error_text:
|
||
msg += f"\033[90m错误: {error_text}\033[0m\r\n"
|
||
|
||
await self.send(bytes_data=msg.encode())
|
||
|
||
except Exception as e:
|
||
error_msg = f"\033[31m✗ 卸载执行异常: {str(e)}\033[0m\r\n"
|
||
await self.send(bytes_data=error_msg.encode())
|
||
logger.error(f"卸载脚本执行失败: {e}")
|
||
|
||
async def _attach_deploy_session(self):
|
||
"""Attach 到部署会话查看进度"""
|
||
if not self.shell or not self.ssh_client:
|
||
return
|
||
|
||
session_name = f'xingrin_deploy_{self.worker_id}'
|
||
|
||
# 先静默检查会话是否存在
|
||
check_cmd = f"tmux has-session -t {session_name} 2>/dev/null && echo EXISTS || echo NOT_EXISTS"
|
||
stdin, stdout, stderr = await asyncio.to_thread(
|
||
self.ssh_client.exec_command, check_cmd
|
||
)
|
||
result = await asyncio.to_thread(stdout.read)
|
||
result = result.decode().strip()
|
||
|
||
if "EXISTS" in result:
|
||
# 会话存在,直接 attach
|
||
await asyncio.to_thread(self.shell.send, f"tmux attach -t {session_name}\n")
|
||
else:
|
||
# 会话不存在,发送提示
|
||
msg = "\r\n\033[33m没有正在运行的部署任务\033[0m\r\n\033[90m请先点击 [执行部署] 按钮启动部署\033[0m\r\n\r\n"
|
||
await self.send(bytes_data=msg.encode())
|
||
|
||
# Channel Layer 消息处理
|
||
async def terminal_output(self, event):
|
||
if self.shell:
|
||
await asyncio.to_thread(self.shell.send, event['content'])
|
||
|
||
async def deploy_status(self, event):
|
||
await self.send(text_data=json.dumps({
|
||
'type': 'status',
|
||
'status': event['status'],
|
||
'message': event.get('message', '')
|
||
}))
|