mirror of
https://github.com/yyhuni/xingrin.git
synced 2026-02-02 04:33:10 +08:00
495 lines
16 KiB
Python
495 lines
16 KiB
Python
|
||
"""
|
||
站点扫描 Flow
|
||
|
||
负责编排站点扫描的完整流程
|
||
|
||
架构:
|
||
- Flow 负责编排多个原子 Task
|
||
- 支持串行执行扫描工具(流式处理)
|
||
- 每个 Task 可独立重试
|
||
- 配置由 YAML 解析
|
||
"""
|
||
|
||
# Django environment initialization (takes effect on import)
from apps.common.prefect_django_setup import setup_django_for_prefect

import logging
import os
import subprocess
from datetime import datetime
from pathlib import Path
from typing import Callable

from prefect import flow

from apps.scan.handlers.scan_flow_handlers import (
    on_scan_flow_running,
    on_scan_flow_completed,
    on_scan_flow_failed,
)
from apps.scan.tasks.site_scan import export_site_urls_task, run_and_stream_save_websites_task
from apps.scan.utils import config_parser, build_scan_command
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
def calculate_timeout_by_line_count(
    tool_config: dict,
    file_path: str,
    base_per_time: int = 1
) -> int:
    """
    Calculate a timeout from the number of lines in a file.

    Counts newline characters (same semantics as ``wc -l``) by reading the
    file in binary chunks, then multiplies the line count by a per-line
    base time. Counting in-process avoids spawning an external ``wc``
    process, which is both faster and portable.

    Args:
        tool_config: Tool configuration dict (unused here; kept so all
            timeout calculators share the same interface).
        file_path: Path of the file whose lines are counted.
        base_per_time: Base time (seconds) per line, default 1.

    Returns:
        int: The computed timeout in seconds; falls back to 600 seconds
        when the file cannot be read.

    Example:
        timeout = calculate_timeout_by_line_count(
            tool_config={},
            file_path='/path/to/urls.txt',
            base_per_time=2
        )
    """
    try:
        # Count b'\n' in fixed-size chunks so huge files never get
        # loaded into memory at once; matches `wc -l` exactly.
        line_count = 0
        with open(file_path, 'rb') as fh:
            for chunk in iter(lambda: fh.read(1 << 20), b''):
                line_count += chunk.count(b'\n')

        # timeout = line count x base time per line
        timeout = line_count * base_per_time

        logger.info(
            f"timeout 自动计算: 文件={file_path}, "
            f"行数={line_count}, 每行时间={base_per_time}秒, timeout={timeout}秒"
        )

        return timeout

    except Exception as e:
        # If the file cannot be read, fall back to a safe default.
        logger.warning(f"计算行数失败: {e},使用默认 timeout: 600秒")
        return 600
|
||
|
||
|
||
def _setup_site_scan_directory(scan_workspace_dir: str) -> Path:
|
||
"""
|
||
创建并验证站点扫描工作目录
|
||
|
||
Args:
|
||
scan_workspace_dir: 扫描工作空间目录
|
||
|
||
Returns:
|
||
Path: 站点扫描目录路径
|
||
|
||
Raises:
|
||
RuntimeError: 目录创建或验证失败
|
||
"""
|
||
site_scan_dir = Path(scan_workspace_dir) / 'site_scan'
|
||
site_scan_dir.mkdir(parents=True, exist_ok=True)
|
||
|
||
if not site_scan_dir.is_dir():
|
||
raise RuntimeError(f"站点扫描目录创建失败: {site_scan_dir}")
|
||
if not os.access(site_scan_dir, os.W_OK):
|
||
raise RuntimeError(f"站点扫描目录不可写: {site_scan_dir}")
|
||
|
||
return site_scan_dir
|
||
|
||
|
||
def _export_site_urls(target_id: int, site_scan_dir: Path) -> tuple[str, int, int]:
    """
    Export the target's site URLs to a file.

    Args:
        target_id: Target ID.
        site_scan_dir: Site-scan working directory.

    Returns:
        tuple: (urls_file, total_urls, association_count). ``total_urls``
        may be 0 — no exception is raised here; the caller decides how to
        handle an empty export.
    """
    logger.info("Step 1: 导出站点URL列表")

    output_path = str(site_scan_dir / 'site_urls.txt')
    export_result = export_site_urls_task(
        target_id=target_id,
        output_file=output_path,
        batch_size=1000  # process 1000 subdomains per batch
    )

    url_total = export_result['total_urls']
    # Number of host/port associations behind the exported URLs.
    assoc_total = export_result['association_count']

    logger.info(
        "✓ 站点URL导出完成 - 文件: %s, URL数量: %d, 关联数: %d",
        export_result['output_file'],
        url_total,
        assoc_total
    )

    if not url_total:
        # Deliberately no raise: upstream chooses whether to skip or abort.
        logger.warning("目标下没有可用的站点URL,无法执行站点扫描")

    return export_result['output_file'], url_total, assoc_total
|
||
|
||
|
||
def _run_scans_sequentially(
    enabled_tools: dict,
    urls_file: str,
    total_urls: int,
    site_scan_dir: Path,
    scan_id: int,
    target_id: int,
    target_name: str
) -> tuple[dict, int, list, list]:
    """
    Run the enabled site-scan tools one after another.

    For each tool: build its command, resolve its timeout (the config value
    'auto' triggers dynamic calculation), then execute it while streaming
    parsed results into the database. A failing tool is recorded in
    ``failed_tools`` and skipped; it never aborts the remaining tools.

    Args:
        enabled_tools: Mapping of tool name -> tool configuration.
        urls_file: Path to the exported URL list.
        total_urls: Number of URLs in the list (logging only).
        site_scan_dir: Site-scan working directory (used as cwd and for logs).
        scan_id: Scan task ID.
        target_id: Target ID.
        target_name: Target name (used in error logs).

    Returns:
        tuple: (tool_stats, processed_records, successful_tool_names,
        failed_tools). When every tool fails this returns
        ({}, 0, [], failed_tools) instead of raising, so the overall scan
        can continue.
    """
    tool_stats = {}
    processed_records = 0
    failed_tools = []

    for tool_name, tool_config in enabled_tools.items():
        # 1. Build the full command (variable substitution).
        try:
            command = build_scan_command(
                tool_name=tool_name,
                scan_type='site_scan',
                command_params={
                    'url_file': urls_file
                },
                tool_config=tool_config
            )
        except Exception as e:
            reason = f"命令构建失败: {str(e)}"
            logger.error(f"构建 {tool_name} 命令失败: {e}")
            failed_tools.append({'tool': tool_name, 'reason': reason})
            continue

        # 2. Resolve the timeout. The dynamic estimate is needed in both
        # branches, so compute it once.
        config_timeout = tool_config.get('timeout', 300)
        dynamic_timeout = calculate_timeout_by_line_count(tool_config, urls_file, base_per_time=1)
        if config_timeout == 'auto':
            timeout = dynamic_timeout
            logger.info(f"✓ 工具 {tool_name} 动态计算 timeout: {timeout}秒")
        else:
            # Use whichever is larger: configured value or dynamic estimate.
            timeout = max(dynamic_timeout, config_timeout)

        # 2.1 Per-tool log file path (mirrors the port-scan layout).
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        log_file = site_scan_dir / f"{tool_name}_{timestamp}.log"

        logger.info(
            "开始执行 %s 站点扫描 - URL数: %d, 最终超时: %ds",
            tool_name, total_urls, timeout
        )

        # 3. Execute the scan task: stream output and save results as
        # they are parsed.
        try:
            result = run_and_stream_save_websites_task(
                cmd=command,
                tool_name=tool_name,
                scan_id=scan_id,
                target_id=target_id,
                cwd=str(site_scan_dir),
                shell=True,
                batch_size=1000,
                timeout=timeout,
                log_file=str(log_file)
            )

            tool_stats[tool_name] = {
                'command': command,
                'result': result,
                'timeout': timeout
            }
            processed_records += result.get('processed_records', 0)

            logger.info(
                "✓ 工具 %s 流式处理完成 - 处理记录: %d, 创建站点: %d, 跳过: %d",
                tool_name,
                result.get('processed_records', 0),
                result.get('created_websites', 0),
                result.get('skipped_no_subdomain', 0) + result.get('skipped_failed', 0)
            )

        except subprocess.TimeoutExpired:
            # Timeouts are reported separately from other failures; data
            # parsed before the timeout has already been persisted.
            reason = f"执行超时(配置: {timeout}秒)"
            failed_tools.append({'tool': tool_name, 'reason': reason})
            logger.warning(
                "⚠️ 工具 %s 执行超时 - 超时配置: %d秒\n"
                "注意:超时前已解析的站点数据已保存到数据库,但扫描未完全完成。",
                tool_name, timeout
            )
        except Exception as exc:
            # Any other failure: record and move on to the next tool.
            failed_tools.append({'tool': tool_name, 'reason': str(exc)})
            logger.error("工具 %s 执行失败: %s", tool_name, exc, exc_info=True)

    if failed_tools:
        logger.warning(
            "以下扫描工具执行失败: %s",
            ', '.join([f['tool'] for f in failed_tools])
        )

    if not tool_stats:
        error_details = "; ".join([f"{f['tool']}: {f['reason']}" for f in failed_tools])
        logger.warning("所有站点扫描工具均失败 - 目标: %s, 失败工具: %s", target_name, error_details)
        # Return an empty result instead of raising so the scan continues.
        return {}, 0, [], failed_tools

    # Successful tools = enabled tools minus the failed ones (set lookup
    # avoids rescanning failed_tools per name).
    failed_names = {f['tool'] for f in failed_tools}
    successful_tool_names = [name for name in enabled_tools if name not in failed_names]

    logger.info(
        "✓ 串行站点扫描执行完成 - 成功: %d/%d (成功: %s, 失败: %s)",
        len(tool_stats), len(enabled_tools),
        ', '.join(successful_tool_names) if successful_tool_names else '无',
        ', '.join([f['tool'] for f in failed_tools]) if failed_tools else '无'
    )

    return tool_stats, processed_records, successful_tool_names, failed_tools
|
||
|
||
|
||
def calculate_timeout(url_count: int, base: int = 600, per_url: int = 1) -> int:
    """
    Derive a scan timeout from the number of URLs.

    Rules:
    - Base time: 600 seconds (10 minutes) by default.
    - Each URL adds ``per_url`` seconds (1 second by default).
    - No upper bound is applied; callers cap the value if they need to.

    Args:
        url_count: Number of URLs; must be a positive integer.
        base: Base timeout in seconds, default 600.
        per_url: Seconds added per URL, default 1.

    Returns:
        int: The computed timeout in seconds.

    Raises:
        ValueError: If ``url_count`` is zero or negative.
    """
    if url_count < 0:
        raise ValueError(f"URL数量不能为负数: {url_count}")
    if url_count == 0:
        raise ValueError("URL数量不能为0")

    return base + int(url_count * per_url)
|
||
|
||
|
||
@flow(
    name="site_scan",
    log_prints=True,
    on_running=[on_scan_flow_running],
    on_completion=[on_scan_flow_completed],
    on_failure=[on_scan_flow_failed],
)
def site_scan_flow(
    scan_id: int,
    target_name: str,
    target_id: int,
    scan_workspace_dir: str,
    enabled_tools: dict
) -> dict:
    """
    Site-scan flow.

    Responsibilities:
    1. Join every subdomain of the target with its associated ports into
       URLs and write them to a file.
    2. Batch-probe the URLs and stream results into the database as they
       are parsed.

    Steps:
        Step 0: create the working directory
        Step 1: export the site URL list
        Step 2: log the enabled tool configuration
        Step 3: run the scan tools sequentially, saving results in real time

    Args:
        scan_id: Scan task ID.
        target_name: Target name.
        target_id: Target ID.
        scan_workspace_dir: Scan workspace directory.
        enabled_tools: Mapping of enabled tool name -> configuration.

    Returns:
        dict: Summary with keys 'success', 'scan_id', 'target',
        'scan_workspace_dir', 'urls_file', 'total_urls',
        'association_count', 'processed_records', 'created_websites',
        'skipped_no_subdomain', 'skipped_failed', 'executed_tasks' and a
        nested 'tool_stats' dict with 'total', 'successful', 'failed',
        'successful_tools', 'failed_tools' and 'details'.

    Raises:
        ValueError: On invalid arguments/configuration.
        RuntimeError: On execution failure.
    """
    try:
        banner = "=" * 60
        logger.info(
            banner + "\n" +
            "开始站点扫描\n" +
            f" Scan ID: {scan_id}\n" +
            f" Target: {target_name}\n" +
            f" Workspace: {scan_workspace_dir}\n" +
            banner
        )

        # Argument validation: fail fast on the first missing value.
        for is_valid, message in (
            (scan_id is not None, "scan_id 不能为空"),
            (bool(target_name), "target_name 不能为空"),
            (target_id is not None, "target_id 不能为空"),
            (bool(scan_workspace_dir), "scan_workspace_dir 不能为空"),
        ):
            if not is_valid:
                raise ValueError(message)

        # Step 0: create the working directory.
        site_scan_dir = _setup_site_scan_directory(scan_workspace_dir)

        # Step 1: export the site URL list.
        urls_file, total_urls, association_count = _export_site_urls(
            target_id, site_scan_dir
        )

        if not total_urls:
            # Nothing to scan: return an all-zero summary without failing.
            logger.warning("目标下没有可用的站点URL,跳过站点扫描")
            return {
                'success': True,
                'scan_id': scan_id,
                'target': target_name,
                'scan_workspace_dir': scan_workspace_dir,
                'urls_file': urls_file,
                'total_urls': 0,
                'association_count': association_count,
                'processed_records': 0,
                'created_websites': 0,
                'skipped_no_subdomain': 0,
                'skipped_failed': 0,
                'executed_tasks': ['export_site_urls'],
                'tool_stats': {
                    'total': 0,
                    'successful': 0,
                    'failed': 0,
                    'successful_tools': [],
                    'failed_tools': [],
                    'details': {}
                }
            }

        # Step 2: tool configuration info.
        logger.info("Step 2: 工具配置信息")
        logger.info("✓ 启用工具: %s", ', '.join(enabled_tools.keys()))

        # Step 3: run the scan tools sequentially.
        logger.info("Step 3: 串行执行扫描工具并实时保存结果")
        tool_stats, processed_records, successful_tool_names, failed_tools = _run_scans_sequentially(
            enabled_tools=enabled_tools,
            urls_file=urls_file,
            total_urls=total_urls,
            site_scan_dir=site_scan_dir,
            scan_id=scan_id,
            target_id=target_id,
            target_name=target_name
        )

        logger.info(banner + "\n✓ 站点扫描完成\n" + banner)

        # Executed task list: one entry per tool that produced results.
        executed_tasks = ['export_site_urls', 'parse_config'] + [
            f'run_and_stream_save_websites ({tool})' for tool in tool_stats
        ]

        # Aggregate the per-tool counters.
        total_created = 0
        total_skipped_no_subdomain = 0
        total_skipped_failed = 0
        for stats in tool_stats.values():
            outcome = stats['result']
            total_created += outcome.get('created_websites', 0)
            total_skipped_no_subdomain += outcome.get('skipped_no_subdomain', 0)
            total_skipped_failed += outcome.get('skipped_failed', 0)

        return {
            'success': True,
            'scan_id': scan_id,
            'target': target_name,
            'scan_workspace_dir': scan_workspace_dir,
            'urls_file': urls_file,
            'total_urls': total_urls,
            'association_count': association_count,
            'processed_records': processed_records,
            'created_websites': total_created,
            'skipped_no_subdomain': total_skipped_no_subdomain,
            'skipped_failed': total_skipped_failed,
            'executed_tasks': executed_tasks,
            'tool_stats': {
                'total': len(enabled_tools),
                'successful': len(successful_tool_names),
                'failed': len(failed_tools),
                'successful_tools': successful_tool_names,
                'failed_tools': failed_tools,
                'details': tool_stats
            }
        }

    except ValueError as e:
        logger.error("配置错误: %s", e)
        raise
    except RuntimeError as e:
        logger.error("运行时错误: %s", e)
        raise
    except Exception as e:
        logger.exception("站点扫描失败: %s", e)
        raise