# mirror of https://github.com/yyhuni/xingrin.git
# synced 2026-02-02 04:33:10 +08:00
"""
|
||
目录扫描 Flow
|
||
|
||
负责编排目录扫描的完整流程
|
||
|
||
架构:
|
||
- Flow 负责编排多个原子 Task
|
||
- 支持串行执行扫描工具(流式处理)
|
||
- 每个 Task 可独立重试
|
||
- 配置由 YAML 解析
|
||
"""
|
||
|
||
# Django 环境初始化(导入即生效)
|
||
from apps.common.prefect_django_setup import setup_django_for_prefect
|
||
|
||
from prefect import flow
|
||
|
||
import logging
|
||
import os
|
||
import subprocess
|
||
from pathlib import Path
|
||
|
||
from apps.scan.tasks.directory_scan import (
|
||
export_sites_task,
|
||
run_and_stream_save_directories_task
|
||
)
|
||
from apps.scan.handlers.scan_flow_handlers import (
|
||
on_scan_flow_running,
|
||
on_scan_flow_completed,
|
||
on_scan_flow_failed,
|
||
)
|
||
from apps.scan.utils import config_parser, build_scan_command, ensure_wordlist_local
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
def calculate_directory_scan_timeout(
    tool_config: dict,
    base_per_word: float = 1.0,
    min_timeout: int = 60,
    max_timeout: int = 7200
) -> int:
    """Compute the directory-scan timeout from the wordlist line count.

    Formula: timeout = line count x base_per_word, clamped to the range
    [min_timeout, max_timeout] (60 s to 2 hours by default).

    Args:
        tool_config: Tool configuration dict containing the 'wordlist' path.
        base_per_word: Base seconds per wordlist entry (default 1.0).
        min_timeout: Lower bound in seconds (default 60).
        max_timeout: Upper bound in seconds (default 7200, i.e. 2 hours).

    Returns:
        int: Timeout in seconds, within [min_timeout, max_timeout].

    Example:
        # 1000-line wordlist x 1.0 s = 1000 s -> 1000 s (within bounds)
        # 10000-line wordlist x 1.0 s = 10000 s -> capped at 7200 s
        timeout = calculate_directory_scan_timeout(
            tool_config={'wordlist': '/path/to/wordlist.txt'}
        )
    """
    try:
        # Resolve the wordlist path from the tool configuration.
        wordlist_path = tool_config.get('wordlist')
        if not wordlist_path:
            logger.warning("工具配置中未指定 wordlist,使用默认超时: %d秒", min_timeout)
            return min_timeout

        # Expand a leading '~' to the user's home directory.
        wordlist_path = os.path.expanduser(wordlist_path)

        # Fall back to the minimum when the wordlist is missing.
        if not os.path.exists(wordlist_path):
            logger.warning("字典文件不存在: %s,使用默认超时: %d秒", wordlist_path, min_timeout)
            return min_timeout

        # Use `wc -l` for a fast line count of the wordlist.
        result = subprocess.run(
            ['wc', '-l', wordlist_path],
            capture_output=True,
            text=True,
            check=True
        )
        # `wc -l` output format: "<count> <filename>".
        line_count = int(result.stdout.strip().split()[0])

        # Raw timeout estimate.
        timeout = int(line_count * base_per_word)

        # Clamp to [min_timeout, max_timeout].
        # Fix: max_timeout was previously accepted and documented but never
        # applied, so huge wordlists produced unbounded timeouts.
        timeout = min(max_timeout, max(min_timeout, timeout))

        logger.info(
            "目录扫描超时计算 - 字典: %s, 行数: %d, 基础时间: %.3f秒/词, 计算超时: %d秒",
            wordlist_path, line_count, base_per_word, timeout
        )

        return timeout

    except subprocess.CalledProcessError as e:
        logger.error("统计字典行数失败: %s", e)
        # `wc` failed: fall back to the default timeout.
        return min_timeout
    except (ValueError, IndexError) as e:
        logger.error("解析字典行数失败: %s", e)
        return min_timeout
    except Exception as e:
        logger.error("计算超时时间异常: %s", e)
        return min_timeout
def _setup_directory_scan_directory(scan_workspace_dir: str) -> Path:
|
||
"""
|
||
创建并验证目录扫描工作目录
|
||
|
||
Args:
|
||
scan_workspace_dir: 扫描工作空间目录
|
||
|
||
Returns:
|
||
Path: 目录扫描目录路径
|
||
|
||
Raises:
|
||
RuntimeError: 目录创建或验证失败
|
||
"""
|
||
directory_scan_dir = Path(scan_workspace_dir) / 'directory_scan'
|
||
directory_scan_dir.mkdir(parents=True, exist_ok=True)
|
||
|
||
if not directory_scan_dir.is_dir():
|
||
raise RuntimeError(f"目录扫描目录创建失败: {directory_scan_dir}")
|
||
if not os.access(directory_scan_dir, os.W_OK):
|
||
raise RuntimeError(f"目录扫描目录不可写: {directory_scan_dir}")
|
||
|
||
return directory_scan_dir
|
||
|
||
|
||
def _export_site_urls(target_id: int, directory_scan_dir: Path) -> tuple[str, int]:
    """Export every site URL under the target to a file.

    Args:
        target_id: Target ID.
        directory_scan_dir: Directory-scan working directory.

    Returns:
        tuple: (sites_file, site_count)
    """
    logger.info("Step 1: 导出目标的所有站点 URL")

    output_path = str(directory_scan_dir / 'sites.txt')
    export_result = export_sites_task(
        target_id=target_id,
        output_file=output_path,
        batch_size=1000  # Read 1000 rows per batch to bound memory usage.
    )

    exported_count = export_result['total_count']
    logger.info(
        "✓ 站点 URL 导出完成 - 文件: %s, 数量: %d",
        export_result['output_file'],
        exported_count
    )

    if exported_count == 0:
        # Deliberately not raising here; the caller decides how to proceed.
        logger.warning("目标下没有站点,无法执行目录扫描")

    return export_result['output_file'], exported_count
def _run_scans_sequentially(
    enabled_tools: dict,
    sites_file: str,
    directory_scan_dir: Path,
    scan_id: int,
    target_id: int,
    site_count: int,
    target_name: str
) -> tuple[int, int, list]:
    """Run directory-scan tasks sequentially (multi-tool support).

    Args:
        enabled_tools: Mapping of tool name -> tool configuration.
        sites_file: Path to the file listing site URLs (one per line).
        directory_scan_dir: Directory-scan working directory.
        scan_id: Scan job ID.
        target_id: Target ID.
        site_count: Number of sites. Kept for interface compatibility; the
            effective count is re-derived from sites_file.
        target_name: Target name. Kept for interface compatibility.

    Returns:
        tuple: (total_directories, processed_sites, failed_sites)
    """
    # Fix: previously imported inside the per-site loop on every iteration.
    from datetime import datetime

    # Load the site list, skipping blank lines.
    sites = []
    with open(sites_file, 'r', encoding='utf-8') as f:
        for line in f:
            site_url = line.strip()
            if site_url:
                sites.append(site_url)

    logger.info("准备扫描 %d 个站点,使用工具: %s", len(sites), ', '.join(enabled_tools.keys()))

    total_directories = 0
    processed_sites_set = set()  # A set avoids double-counting a site across tools.
    failed_sites = []

    # Iterate over each enabled tool.
    for tool_name, tool_config in enabled_tools.items():
        logger.info("="*60)
        logger.info("使用工具: %s", tool_name)
        logger.info("="*60)

        # If wordlist_name is configured, first make sure the wordlist
        # exists locally (includes hash verification).
        wordlist_name = tool_config.get('wordlist_name')
        if wordlist_name:
            try:
                local_wordlist_path = ensure_wordlist_local(wordlist_name)
                tool_config['wordlist'] = local_wordlist_path
            except Exception as exc:
                logger.error("为工具 %s 准备字典失败: %s", tool_name, exc)
                # This tool cannot run: mark all sites failed, try the next tool.
                failed_sites.extend(sites)
                continue

        # Scan each site one at a time.
        for idx, site_url in enumerate(sites, 1):
            logger.info(
                "[%d/%d] 开始扫描站点: %s (工具: %s)",
                idx, len(sites), site_url, tool_name
            )

            # Build the command through the unified command builder.
            try:
                command = build_scan_command(
                    tool_name=tool_name,
                    scan_type='directory_scan',
                    command_params={
                        'url': site_url
                    },
                    tool_config=tool_config
                )
            except Exception as e:
                logger.error(
                    "✗ [%d/%d] 构建 %s 命令失败: %s - 站点: %s",
                    idx, len(sites), tool_name, e, site_url
                )
                failed_sites.append(site_url)
                continue

            # Per-site timeout from config; 'auto' derives it from the
            # wordlist size. Sites are scanned one at a time, so this is
            # the timeout of a single site.
            site_timeout = tool_config.get('timeout', 300)
            if site_timeout == 'auto':
                site_timeout = calculate_directory_scan_timeout(tool_config)
                logger.info(f"✓ 工具 {tool_name} 动态计算 timeout: {site_timeout}秒")

            # Per-site log file path.
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            log_file = directory_scan_dir / f"{tool_name}_{timestamp}_{idx}.log"

            try:
                # Direct task call (serial execution).
                result = run_and_stream_save_directories_task(
                    cmd=command,
                    tool_name=tool_name,
                    scan_id=scan_id,
                    target_id=target_id,
                    site_url=site_url,
                    cwd=str(directory_scan_dir),
                    shell=True,
                    batch_size=1000,
                    timeout=site_timeout,
                    log_file=str(log_file)
                )

                total_directories += result.get('created_directories', 0)
                processed_sites_set.add(site_url)  # Record success via the set.

                logger.info(
                    "✓ [%d/%d] 站点扫描完成: %s - 发现 %d 个目录",
                    idx, len(sites), site_url,
                    result.get('created_directories', 0)
                )

            except subprocess.TimeoutExpired:
                # Timeout handled separately: partial results were already
                # streamed to the database before the timeout fired.
                failed_sites.append(site_url)
                logger.warning(
                    "⚠️ [%d/%d] 站点扫描超时: %s - 超时配置: %d秒\n"
                    "注意:超时前已解析的目录数据已保存到数据库,但扫描未完全完成。",
                    idx, len(sites), site_url, site_timeout
                )
            except Exception as exc:
                # Any other failure.
                failed_sites.append(site_url)
                logger.error(
                    "✗ [%d/%d] 站点扫描失败: %s - 错误: %s",
                    idx, len(sites), site_url, exc
                )

            # Progress report every 10 sites.
            if idx % 10 == 0:
                logger.info(
                    "进度: %d/%d (%.1f%%) - 已发现 %d 个目录",
                    idx, len(sites), idx/len(sites)*100, total_directories
                )

    # Tally successes and failures.
    processed_count = len(processed_sites_set)

    if failed_sites:
        logger.warning(
            "部分站点扫描失败: %d/%d",
            len(failed_sites), len(sites)
        )

    logger.info(
        "✓ 串行目录扫描执行完成 - 成功: %d/%d, 失败: %d, 总目录数: %d",
        processed_count, len(sites), len(failed_sites), total_directories
    )

    return total_directories, processed_count, failed_sites
@flow(
    name="directory_scan",
    log_prints=True,
    on_running=[on_scan_flow_running],
    on_completion=[on_scan_flow_completed],
    on_failure=[on_scan_flow_failed],
)
def directory_scan_flow(
    scan_id: int,
    target_name: str,
    target_id: int,
    scan_workspace_dir: str,
    enabled_tools: dict
) -> dict:
    """Directory-scan flow.

    Responsibilities:
    1. Collect every site URL belonging to the target.
    2. Run a directory scan against each site URL (ffuf and similar tools).
    3. Stream scan results into the ``Directory`` database table.

    Workflow:
        Step 0: create the working directory
        Step 1: export the site URL list to a file (consumed by scan tools)
        Step 2: report the tool configuration
        Step 3: run the scan tools sequentially, saving results in real time

    ffuf output fields: url, length, status, words, lines, content_type,
    duration.

    Args:
        scan_id: Scan job ID.
        target_name: Target name.
        target_id: Target ID.
        scan_workspace_dir: Scan workspace directory.
        enabled_tools: Mapping of enabled tool name -> configuration.

    Returns:
        dict: {
            'success': bool,
            'scan_id': int,
            'target': str,
            'scan_workspace_dir': str,
            'sites_file': str,
            'site_count': int,
            'total_directories': int,   # total directories discovered
            'processed_sites': int,     # sites scanned successfully
            'failed_sites_count': int,  # sites that failed
            'executed_tasks': list
        }

    Raises:
        ValueError: On invalid arguments.
        RuntimeError: On execution failure.
    """
    try:
        logger.info(
            "="*60 + "\n" +
            "开始目录扫描\n" +
            f" Scan ID: {scan_id}\n" +
            f" Target: {target_name}\n" +
            f" Workspace: {scan_workspace_dir}\n" +
            "="*60
        )

        # Argument validation: reject missing IDs and empty inputs early.
        if scan_id is None:
            raise ValueError("scan_id 不能为空")
        if not target_name:
            raise ValueError("target_name 不能为空")
        if target_id is None:
            raise ValueError("target_id 不能为空")
        if not scan_workspace_dir:
            raise ValueError("scan_workspace_dir 不能为空")
        if not enabled_tools:
            raise ValueError("enabled_tools 不能为空")

        # Step 0: create the working directory.
        directory_scan_dir = _setup_directory_scan_directory(scan_workspace_dir)

        # Step 1: export site URLs.
        sites_file, site_count = _export_site_urls(target_id, directory_scan_dir)

        # Fields shared by both the early-exit and the final result.
        summary = {
            'success': True,
            'scan_id': scan_id,
            'target': target_name,
            'scan_workspace_dir': scan_workspace_dir,
            'sites_file': sites_file,
        }

        if site_count == 0:
            # Nothing to scan: return a successful, empty result.
            logger.warning("目标下没有站点,跳过目录扫描")
            summary.update(
                site_count=0,
                total_directories=0,
                processed_sites=0,
                failed_sites_count=0,
                executed_tasks=['export_sites'],
            )
            return summary

        # Step 2: report the tool configuration.
        logger.info("Step 2: 工具配置信息")
        logger.info(
            "✓ 启用工具: %s",
            ', '.join(enabled_tools.keys())
        )

        # Step 3: run the scan tools sequentially, saving results in real time.
        logger.info("Step 3: 串行执行扫描工具并实时保存结果")
        total_directories, processed_sites, failed_sites = _run_scans_sequentially(
            enabled_tools=enabled_tools,
            sites_file=sites_file,
            directory_scan_dir=directory_scan_dir,
            scan_id=scan_id,
            target_id=target_id,
            site_count=site_count,
            target_name=target_name
        )

        # All sites failing is logged but does not abort the scan.
        if processed_sites == 0 and site_count > 0:
            logger.warning("所有站点扫描均失败 - 总站点数: %d, 失败数: %d", site_count, len(failed_sites))

        logger.info("="*60 + "\n✓ 目录扫描完成\n" + "="*60)

        summary.update(
            site_count=site_count,
            total_directories=total_directories,
            processed_sites=processed_sites,
            failed_sites_count=len(failed_sites),
            executed_tasks=['export_sites', 'run_and_stream_save_directories'],
        )
        return summary

    except Exception as e:
        # Top-level boundary: log with traceback, then re-raise so Prefect
        # marks the flow run as failed.
        logger.exception("目录扫描失败: %s", e)
        raise