Files
xingrin/backend/apps/scan/flows/url_fetch/utils.py

233 lines
6.8 KiB
Python
Raw Normal View History

2025-12-12 18:04:57 +08:00
"""
URL Fetch 共享工具函数
"""
import logging
import subprocess
import uuid
from datetime import datetime
from pathlib import Path
from apps.scan.utils import build_scan_command
logger = logging.getLogger(__name__)
def calculate_timeout_by_line_count(
    tool_config: dict,
    file_path: str,
    base_per_time: int = 1,
) -> int:
    """
    Derive a timeout (in seconds) from the number of lines in a file.

    The lines are counted in-process rather than by spawning ``wc -l``.
    ``wc -l`` counts newline characters, so a final line without a trailing
    newline was not counted (a one-line file could yield a timeout of 0).
    Counting here also removes the dependency on an external binary.

    Args:
        tool_config: Tool configuration; currently unused, kept as a
            reserved parameter for more elaborate calculations.
        file_path: Path of the input file whose lines are counted.
        base_per_time: Seconds budgeted per line.

    Returns:
        int: ``line_count * base_per_time``, or 600 when the file cannot
        be read or any other error occurs while counting.
    """
    try:
        # Binary mode: only line boundaries matter, so skip text decoding
        # (and the decode errors it could raise on unexpected bytes).
        with open(file_path, "rb") as handle:
            line_count = sum(1 for _ in handle)

        # NOTE(review): an empty file still yields a timeout of 0 - confirm
        # that callers handle this the way they expect.
        timeout = line_count * base_per_time
        logger.info(
            "timeout 自动计算: 文件=%s, 行数=%d, 每行时间=%d秒, timeout=%d",
            file_path,
            line_count,
            base_per_time,
            timeout,
        )
        return timeout
    except Exception as e:
        # Deliberately best-effort: any failure falls back to the default.
        logger.warning("计算行数失败: %s,将使用默认 timeout: 600秒", e)
        return 600
def prepare_tool_execution(
    tool_name: str,
    tool_config: dict,
    input_file: str,
    input_type: str,
    output_dir: Path,
    scan_type: str = "url_fetch"
) -> dict:
    """
    Assemble the execution parameters for a single tool.

    Args:
        tool_name: Name of the tool.
        tool_config: Configuration of the tool; its ``timeout`` entry may be
            the string ``"auto"`` or a value convertible with ``int()``.
        input_file: Path of the input file.
        input_type: Kind of input (e.g. domains_file / sites_file); also
            used as the key under which the input file is passed to the
            command builder.
        output_dir: Directory in which the output file is placed.
        scan_type: Scan type forwarded to the command builder.

    Returns:
        dict: On success, the keys ``command``, ``input_file``,
        ``input_type``, ``output_file`` and ``timeout``. On failure, a
        dict with a single ``error`` key describing the problem.
    """
    fallback_timeout = 3600

    # Step 1: count the input lines (logged for information only).
    try:
        with open(input_file, 'r') as source:
            input_count = 0
            for _line in source:
                input_count += 1
        logger.info("工具 %s - 输入类型: %s, 数量: %d", tool_name, input_type, input_count)
    except Exception as exc:
        return {"error": f"读取输入文件失败: {exc}"}

    # Step 2: output path = <tool>_<timestamp>_<4 hex chars of a UUID>.txt
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    suffix = uuid.uuid4().hex[:4]
    output_name = f"{tool_name}_{stamp}_{suffix}.txt"
    output_file = str(output_dir / output_name)

    # Step 3: build the command line.
    command_params = {input_type: input_file, "output_file": output_file}
    try:
        command = build_scan_command(
            tool_name=tool_name,
            scan_type=scan_type,
            command_params=command_params,
            tool_config=tool_config,
        )
    except Exception as exc:
        logger.error("构建 %s 命令失败: %s", tool_name, exc)
        return {"error": f"命令构建失败: {exc}"}

    # Step 4: resolve the timeout - either "auto" or an explicit number.
    configured = tool_config.get("timeout", fallback_timeout)
    wants_auto = isinstance(configured, str) and configured == "auto"
    if not wants_auto:
        try:
            timeout = int(configured)
        except (TypeError, ValueError):
            logger.warning(
                "工具 %s 的 timeout 配置无效(%s),将使用默认 3600 秒",
                tool_name,
                configured,
            )
            timeout = fallback_timeout
    else:
        try:
            # katana / waymore get a larger per-line time budget.
            if tool_name in ("katana", "waymore"):
                per_line = 360
            else:
                per_line = 1
            timeout = calculate_timeout_by_line_count(
                tool_config=tool_config,
                file_path=input_file,
                base_per_time=per_line,
            )
        except Exception as exc:
            logger.warning(
                "工具 %s 自动计算 timeout 失败,将使用默认 3600 秒: %s",
                tool_name,
                exc,
            )
            timeout = fallback_timeout

    # Step 5: hand the collected parameters back to the caller.
    return {
        "command": command,
        "input_file": input_file,
        "input_type": input_type,
        "output_file": output_file,
        "timeout": timeout,
    }
def run_tools_parallel(
    tools: dict,
    input_file: str,
    input_type: str,
    output_dir: Path
) -> tuple[list, list, list]:
    """
    Submit every tool as a task, then gather the outcomes.

    Args:
        tools: Mapping of ``{tool_name: tool_config}``.
        input_file: Path of the input file shared by all tools.
        input_type: Kind of input passed on to the parameter builder.
        output_dir: Directory in which each tool's output file is placed.

    Returns:
        tuple: ``(result_files, failed_tools, successful_tool_names)``.
        ``result_files`` holds the output paths of successful tools,
        ``failed_tools`` holds ``{"tool": ..., "reason": ...}`` dicts, and
        ``successful_tool_names`` lists the tools with no recorded failure.
    """
    # Imported at call time rather than module load time
    # (presumably to avoid a circular import - TODO confirm).
    from apps.scan.tasks.url_fetch import run_url_fetcher_task

    pending: dict[str, object] = {}
    failed_tools: list[dict] = []

    # Phase 1: prepare and submit a task for every configured tool.
    for tool_name, tool_config in tools.items():
        params = prepare_tool_execution(
            tool_name=tool_name,
            tool_config=tool_config,
            input_file=input_file,
            input_type=input_type,
            output_dir=output_dir,
        )
        if "error" not in params:
            logger.info(
                "提交任务 - 工具: %s, 输入: %s, 超时: %d",
                tool_name,
                input_type,
                params["timeout"],
            )
            pending[tool_name] = run_url_fetcher_task.submit(
                tool_name=tool_name,
                command=params["command"],
                timeout=params["timeout"],
                output_file=params["output_file"],
            )
        else:
            # Preparation failed; record the reason and skip submission.
            failed_tools.append({"tool": tool_name, "reason": params["error"]})

    # Phase 2: wait for each submitted task and classify its outcome.
    result_files = []
    for tool_name, future in pending.items():
        try:
            outcome = future.result()
            if not (outcome and outcome['success']):
                failed_tools.append({
                    'tool': tool_name,
                    'reason': '未生成结果或无有效URL'
                })
                logger.warning("⚠️ 工具 %s 未生成有效结果", tool_name)
                continue
            result_files.append(outcome['output_file'])
            logger.info(
                "✓ 工具 %s 执行成功 - 发现 URL: %d",
                tool_name, outcome['url_count']
            )
        except Exception as exc:
            failed_tools.append({
                'tool': tool_name,
                'reason': str(exc)
            })
            logger.warning("⚠️ 工具 %s 执行失败: %s", tool_name, exc)

    # Phase 3: every tool without a failure record counts as successful.
    failed_names = {entry['tool'] for entry in failed_tools}
    successful_tool_names = [name for name in tools if name not in failed_names]
    return result_files, failed_tools, successful_tool_names