backend/apps/scan/flows/url_fetch/utils.py

"""
URL Fetch 共享工具函数
"""

import logging
import subprocess
import uuid
from datetime import datetime
from pathlib import Path

from apps.scan.utils import build_scan_command

logger = logging.getLogger(__name__)


def calculate_timeout_by_line_count(
    tool_config: dict,
    file_path: str,
    base_per_time: int = 1,
) -> int:
    """
    根据文件行数自动计算超时时间
    
    Args:
        tool_config: 工具配置（保留参数，未来可能用于更复杂的计算）
        file_path: 输入文件路径
        base_per_time: 每行的基础时间（秒）
        
    Returns:
        int: 计算出的超时时间（秒）
    """
    try:
        result = subprocess.run(
            ['wc', '-l', file_path],
            capture_output=True,
            text=True,
            check=True,
        )
        line_count = int(result.stdout.strip().split()[0])
        timeout = line_count * base_per_time
        logger.info(
            "timeout 自动计算: 文件=%s, 行数=%d, 每行时间=%d秒, timeout=%d秒",
            file_path,
            line_count,
            base_per_time,
            timeout,
        )
        return timeout
    except Exception as e:
        logger.warning("wc -l 计算行数失败: %s，将使用默认 timeout: 600秒", e)
        return 600


def prepare_tool_execution(
    tool_name: str,
    tool_config: dict,
    input_file: str,
    input_type: str,
    output_dir: Path,
    scan_type: str = "url_fetch"
) -> dict:
    """
    准备单个工具的执行参数
    
    Args:
        tool_name: 工具名称
        tool_config: 工具配置
        input_file: 输入文件路径
        input_type: 输入类型（domains_file 或 sites_file）
        output_dir: 输出目录
        scan_type: 扫描类型
        
    Returns:
        dict: 执行参数，包含 command, input_file, output_file, timeout
              或包含 error 键表示失败
    """
    # 1. 统计输入文件行数
    try:
        with open(input_file, 'r') as f:
            input_count = sum(1 for _ in f)
        logger.info("工具 %s - 输入类型: %s, 数量: %d", tool_name, input_type, input_count)
    except Exception as e:
        return {"error": f"读取输入文件失败: {e}"}

    # 2. 生成输出文件路径（带时间戳和短 UUID 后缀）
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    short_uuid = uuid.uuid4().hex[:4]
    output_file = str(output_dir / f"{tool_name}_{timestamp}_{short_uuid}.txt")

    # 3. 构建命令
    command_params = {
        input_type: input_file,
        "output_file": output_file,
    }

    try:
        command = build_scan_command(
            tool_name=tool_name,
            scan_type=scan_type,
            command_params=command_params,
            tool_config=tool_config,
        )
    except Exception as e:
        logger.error("构建 %s 命令失败: %s", tool_name, e)
        return {"error": f"命令构建失败: {e}"}

    # 4. 计算超时时间（支持 auto 和显式整数）
    raw_timeout = tool_config.get("timeout", 3600)
    timeout = 3600
    
    if isinstance(raw_timeout, str) and raw_timeout == "auto":
        try:
            # katana / waymore 每个站点需要更长时间
            base_per_time = 360 if tool_name in ("katana", "waymore") else 1
            timeout = calculate_timeout_by_line_count(
                tool_config=tool_config,
                file_path=input_file,
                base_per_time=base_per_time,
            )
        except Exception as e:
            logger.warning(
                "工具 %s 自动计算 timeout 失败，将使用默认 3600 秒: %s",
                tool_name,
                e,
            )
            timeout = 3600
    else:
        try:
            timeout = int(raw_timeout)
        except (TypeError, ValueError):
            logger.warning(
                "工具 %s 的 timeout 配置无效(%s)，将使用默认 3600 秒",
                tool_name,
                raw_timeout,
            )
            timeout = 3600

    # 5. 返回执行参数
    return {
        "command": command,
        "input_file": input_file,
        "input_type": input_type,
        "output_file": output_file,
        "timeout": timeout,
    }


def run_tools_parallel(
    tools: dict,
    input_file: str,
    input_type: str,
    output_dir: Path
) -> tuple[list, list, list]:
    """
    并行执行工具列表
    
    Args:
        tools: 工具配置字典 {tool_name: tool_config}
        input_file: 输入文件路径
        input_type: 输入类型
        output_dir: 输出目录
        
    Returns:
        tuple: (result_files, failed_tools, successful_tool_names)
    """
    from apps.scan.tasks.url_fetch import run_url_fetcher_task

    futures: dict[str, object] = {}
    failed_tools: list[dict] = []

    # 提交所有工具的并行任务
    for tool_name, tool_config in tools.items():
        exec_params = prepare_tool_execution(
            tool_name=tool_name,
            tool_config=tool_config,
            input_file=input_file,
            input_type=input_type,
            output_dir=output_dir,
        )

        if "error" in exec_params:
            failed_tools.append({"tool": tool_name, "reason": exec_params["error"]})
            continue

        logger.info(
            "提交任务 - 工具: %s, 输入: %s, 超时: %d秒",
            tool_name,
            input_type,
            exec_params["timeout"],
        )

        # 提交并行任务
        future = run_url_fetcher_task.submit(
            tool_name=tool_name,
            command=exec_params["command"],
            timeout=exec_params["timeout"],
            output_file=exec_params["output_file"],
        )
        futures[tool_name] = future

    # 收集执行结果
    result_files = []
    for tool_name, future in futures.items():
        try:
            result = future.result()
            if result and result['success']:
                result_files.append(result['output_file'])
                logger.info(
                    "✓ 工具 %s 执行成功 - 发现 URL: %d",
                    tool_name, result['url_count']
                )
            else:
                failed_tools.append({
                    'tool': tool_name,
                    'reason': '未生成结果或无有效URL'
                })
                logger.warning("⚠️ 工具 %s 未生成有效结果", tool_name)
        except Exception as e:
            failed_tools.append({
                'tool': tool_name,
                'reason': str(e)
            })
            logger.warning("⚠️ 工具 %s 执行失败: %s", tool_name, e)

    # 计算成功的工具列表
    failed_tool_names = [f['tool'] for f in failed_tools]
    successful_tool_names = [
        name for name in tools.keys()
        if name not in failed_tool_names
    ]

    return result_files, failed_tools, successful_tool_names