Files
xingrin/backend/apps/scan/flows/url_fetch/utils.py
2025-12-12 18:04:57 +08:00

233 lines
6.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
URL Fetch 共享工具函数
"""
import logging
import subprocess
import uuid
from datetime import datetime
from pathlib import Path
from apps.scan.utils import build_scan_command
logger = logging.getLogger(__name__)
def calculate_timeout_by_line_count(
tool_config: dict,
file_path: str,
base_per_time: int = 1,
) -> int:
"""
根据文件行数自动计算超时时间
Args:
tool_config: 工具配置(保留参数,未来可能用于更复杂的计算)
file_path: 输入文件路径
base_per_time: 每行的基础时间(秒)
Returns:
int: 计算出的超时时间(秒)
"""
try:
result = subprocess.run(
['wc', '-l', file_path],
capture_output=True,
text=True,
check=True,
)
line_count = int(result.stdout.strip().split()[0])
timeout = line_count * base_per_time
logger.info(
"timeout 自动计算: 文件=%s, 行数=%d, 每行时间=%d秒, timeout=%d",
file_path,
line_count,
base_per_time,
timeout,
)
return timeout
except Exception as e:
logger.warning("wc -l 计算行数失败: %s,将使用默认 timeout: 600秒", e)
return 600
def prepare_tool_execution(
tool_name: str,
tool_config: dict,
input_file: str,
input_type: str,
output_dir: Path,
scan_type: str = "url_fetch"
) -> dict:
"""
准备单个工具的执行参数
Args:
tool_name: 工具名称
tool_config: 工具配置
input_file: 输入文件路径
input_type: 输入类型domains_file 或 sites_file
output_dir: 输出目录
scan_type: 扫描类型
Returns:
dict: 执行参数,包含 command, input_file, output_file, timeout
或包含 error 键表示失败
"""
# 1. 统计输入文件行数
try:
with open(input_file, 'r') as f:
input_count = sum(1 for _ in f)
logger.info("工具 %s - 输入类型: %s, 数量: %d", tool_name, input_type, input_count)
except Exception as e:
return {"error": f"读取输入文件失败: {e}"}
# 2. 生成输出文件路径(带时间戳和短 UUID 后缀)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
short_uuid = uuid.uuid4().hex[:4]
output_file = str(output_dir / f"{tool_name}_{timestamp}_{short_uuid}.txt")
# 3. 构建命令
command_params = {
input_type: input_file,
"output_file": output_file,
}
try:
command = build_scan_command(
tool_name=tool_name,
scan_type=scan_type,
command_params=command_params,
tool_config=tool_config,
)
except Exception as e:
logger.error("构建 %s 命令失败: %s", tool_name, e)
return {"error": f"命令构建失败: {e}"}
# 4. 计算超时时间(支持 auto 和显式整数)
raw_timeout = tool_config.get("timeout", 3600)
timeout = 3600
if isinstance(raw_timeout, str) and raw_timeout == "auto":
try:
# katana / waymore 每个站点需要更长时间
base_per_time = 360 if tool_name in ("katana", "waymore") else 1
timeout = calculate_timeout_by_line_count(
tool_config=tool_config,
file_path=input_file,
base_per_time=base_per_time,
)
except Exception as e:
logger.warning(
"工具 %s 自动计算 timeout 失败,将使用默认 3600 秒: %s",
tool_name,
e,
)
timeout = 3600
else:
try:
timeout = int(raw_timeout)
except (TypeError, ValueError):
logger.warning(
"工具 %s 的 timeout 配置无效(%s),将使用默认 3600 秒",
tool_name,
raw_timeout,
)
timeout = 3600
# 5. 返回执行参数
return {
"command": command,
"input_file": input_file,
"input_type": input_type,
"output_file": output_file,
"timeout": timeout,
}
def run_tools_parallel(
tools: dict,
input_file: str,
input_type: str,
output_dir: Path
) -> tuple[list, list, list]:
"""
并行执行工具列表
Args:
tools: 工具配置字典 {tool_name: tool_config}
input_file: 输入文件路径
input_type: 输入类型
output_dir: 输出目录
Returns:
tuple: (result_files, failed_tools, successful_tool_names)
"""
from apps.scan.tasks.url_fetch import run_url_fetcher_task
futures: dict[str, object] = {}
failed_tools: list[dict] = []
# 提交所有工具的并行任务
for tool_name, tool_config in tools.items():
exec_params = prepare_tool_execution(
tool_name=tool_name,
tool_config=tool_config,
input_file=input_file,
input_type=input_type,
output_dir=output_dir,
)
if "error" in exec_params:
failed_tools.append({"tool": tool_name, "reason": exec_params["error"]})
continue
logger.info(
"提交任务 - 工具: %s, 输入: %s, 超时: %d",
tool_name,
input_type,
exec_params["timeout"],
)
# 提交并行任务
future = run_url_fetcher_task.submit(
tool_name=tool_name,
command=exec_params["command"],
timeout=exec_params["timeout"],
output_file=exec_params["output_file"],
)
futures[tool_name] = future
# 收集执行结果
result_files = []
for tool_name, future in futures.items():
try:
result = future.result()
if result and result['success']:
result_files.append(result['output_file'])
logger.info(
"✓ 工具 %s 执行成功 - 发现 URL: %d",
tool_name, result['url_count']
)
else:
failed_tools.append({
'tool': tool_name,
'reason': '未生成结果或无有效URL'
})
logger.warning("⚠️ 工具 %s 未生成有效结果", tool_name)
except Exception as e:
failed_tools.append({
'tool': tool_name,
'reason': str(e)
})
logger.warning("⚠️ 工具 %s 执行失败: %s", tool_name, e)
# 计算成功的工具列表
failed_tool_names = [f['tool'] for f in failed_tools]
successful_tool_names = [
name for name in tools.keys()
if name not in failed_tool_names
]
return result_files, failed_tools, successful_tool_names