Files
xingrin/backend/apps/scan/flows/subdomain_discovery_flow.py
2025-12-19 16:05:32 +08:00

751 lines
28 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
子域名发现扫描 Flow
负责编排子域名发现扫描的完整流程
架构:
- Flow 负责编排多个原子 Task
- 支持并行执行扫描工具
- 每个 Task 可独立重试
- 配置由 YAML 解析
增强流程(4 阶段):
Stage 1: 被动收集(并行) - 必选
Stage 2: 字典爆破(可选) - 子域名字典爆破
Stage 3: 变异生成 + 验证(可选) - dnsgen + 通用存活验证
Stage 4: DNS 存活验证(可选) - 通用存活验证
各阶段可灵活开关,最终结果根据实际执行的阶段动态决定
"""
# Django 环境初始化(导入即生效)
from apps.common.prefect_django_setup import setup_django_for_prefect
from prefect import flow
from pathlib import Path
import logging
import os
from apps.scan.handlers.scan_flow_handlers import (
on_scan_flow_running,
on_scan_flow_completed,
on_scan_flow_failed,
)
from apps.scan.utils import build_scan_command, ensure_wordlist_local
from apps.engine.services.wordlist_service import WordlistService
from apps.common.normalizer import normalize_domain
from apps.common.validators import validate_domain
from datetime import datetime
import uuid
import subprocess
logger = logging.getLogger(__name__)
def _setup_subdomain_directory(scan_workspace_dir: str) -> Path:
"""
创建并验证子域名扫描工作目录
Args:
scan_workspace_dir: 扫描工作空间目录
Returns:
Path: 子域名扫描目录路径
Raises:
RuntimeError: 目录创建或验证失败
"""
result_dir = Path(scan_workspace_dir) / 'subdomain_discovery'
result_dir.mkdir(parents=True, exist_ok=True)
if not result_dir.is_dir():
raise RuntimeError(f"子域名扫描目录创建失败: {result_dir}")
if not os.access(result_dir, os.W_OK):
raise RuntimeError(f"子域名扫描目录不可写: {result_dir}")
return result_dir
def _validate_and_normalize_target(target_name: str) -> str:
    """Normalize the raw target domain and verify that it is valid.

    Args:
        target_name: raw target domain as supplied by the caller

    Returns:
        str: the normalized domain

    Raises:
        ValueError: if the domain is invalid

    Example:
        >>> _validate_and_normalize_target('EXAMPLE.COM')
        'example.com'
        >>> _validate_and_normalize_target('http://example.com')
        'example.com'
    """
    try:
        normalized_target = normalize_domain(target_name)
        validate_domain(normalized_target)
    except ValueError as e:
        # Wrap with the original input for a more useful error message.
        error_msg = f"无效的目标域名: {target_name} - {e}"
        logger.error(error_msg)
        raise ValueError(error_msg) from e
    logger.debug("域名验证通过: %s -> %s", target_name, normalized_target)
    return normalized_target
def _run_scans_parallel(
    enabled_tools: dict,
    domain_name: str,
    result_dir: Path
) -> tuple[list, list, list]:
    """Run all enabled subdomain scan tools in parallel.

    Args:
        enabled_tools: enabled tool configs, e.g. {'tool_name': {'timeout': 600, ...}}
        domain_name: target domain
        result_dir: directory receiving the result files

    Returns:
        tuple: (result_files, failed_tools, successful_tool_names).
        Never raises on tool failure — when every tool fails, empty
        result lists are returned so the overall scan can continue.
    """
    # Imported lazily so Django/Prefect are fully initialized first.
    from apps.scan.tasks.subdomain_discovery import run_subdomain_discovery_task

    # One shared timestamp so all tool outputs of this run sort together.
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    # TODO: integrate the proxy pool management system
    # from apps.proxy.services import proxy_pool
    # proxy_stats = proxy_pool.get_stats()
    # logger.info(f"代理池状态: {proxy_stats['healthy']}/{proxy_stats['total']} 可用")

    failures = []   # human-readable reasons for tools that failed to build/run
    futures = {}    # tool_name -> Prefect future

    # 1. Build each tool's command and submit the parallel tasks.
    for tool_name, tool_config in enabled_tools.items():
        # 1.1 Unique absolute output path per tool invocation.
        short_uuid = uuid.uuid4().hex[:4]
        output_file = str(result_dir / f"{tool_name}_{timestamp}_{short_uuid}.txt")

        # 1.2 Build the full command (placeholder substitution).
        try:
            command = build_scan_command(
                tool_name=tool_name,
                scan_type='subdomain_discovery',
                command_params={
                    'domain': domain_name,       # maps to {domain}
                    'output_file': output_file   # maps to {output_file}
                },
                tool_config=tool_config
            )
        except Exception as e:
            failures.append(f"{tool_name}: 命令构建失败 - {e}")
            logger.error(f"构建 {tool_name} 命令失败: {e}")
            continue

        # 1.3 Resolve the timeout. Robustness fix: a missing 'timeout' key is
        # treated like 'auto' (default 600s) instead of raising KeyError,
        # consistent with _run_single_tool's .get() handling.
        timeout = tool_config.get('timeout', 'auto')
        if timeout == 'auto':
            # Passive subdomain tools usually run long; default to 600s.
            timeout = 600
            logger.info(f"✓ 工具 {tool_name} 使用默认 timeout: {timeout}")

        # 1.4 Submit the task.
        logger.debug(
            f"提交任务 - 工具: {tool_name}, 超时: {timeout}s, 输出: {output_file}"
        )
        futures[tool_name] = run_subdomain_discovery_task.submit(
            tool=tool_name,
            command=command,
            timeout=timeout,
            output_file=output_file
        )

    # 2. Bail out early when no tool could even be submitted.
    if not futures:
        logger.warning(
            "所有扫描工具均无法启动 - 目标: %s, 失败详情: %s",
            domain_name, "; ".join(failures)
        )
        # Return empty results instead of raising so the scan can continue.
        return [], [{'tool': 'all', 'reason': '所有工具均无法启动'}], []

    # 3. Wait for the parallel tasks and collect their results.
    result_files = []
    failed_tools = []
    for tool_name, future in futures.items():
        try:
            result = future.result()  # file path string, or "" on failure
            if result:
                result_files.append(result)
                logger.info("✓ 扫描工具 %s 执行成功: %s", tool_name, result)
            else:
                failures.append(f"{tool_name}: 未生成结果文件")
                failed_tools.append({'tool': tool_name, 'reason': '未生成结果文件'})
                logger.warning("⚠️ 扫描工具 %s 未生成结果文件", tool_name)
        except Exception as e:
            failures.append(f"{tool_name}: {str(e)}")
            failed_tools.append({'tool': tool_name, 'reason': str(e)})
            logger.warning("⚠️ 扫描工具 %s 执行失败: %s", tool_name, str(e))

    # 4. Every submitted tool failed at runtime.
    if not result_files:
        logger.warning(
            "所有扫描工具均失败 - 目标: %s, 失败详情: %s",
            domain_name, "; ".join(failures)
        )
        # Return empty results instead of raising so the scan can continue.
        return [], failed_tools, []

    # 5. Derive the successful tool list (set lookup instead of the original
    # O(n^2) rebuild of the failed-name list on every iteration).
    failed_names = {f['tool'] for f in failed_tools}
    successful_tool_names = [name for name in futures if name not in failed_names]
    logger.info(
        "✓ 扫描工具并行执行完成 - 成功: %d/%d (成功: %s, 失败: %s)",
        len(result_files), len(futures),
        ', '.join(successful_tool_names) if successful_tool_names else '',
        ', '.join([f['tool'] for f in failed_tools]) if failed_tools else ''
    )
    return result_files, failed_tools, successful_tool_names
def _run_single_tool(
    tool_name: str,
    tool_config: dict,
    command_params: dict,
    result_dir: Path,
    scan_type: str = 'subdomain_discovery'
) -> str:
    """Run a single scan tool synchronously and return its output file.

    Args:
        tool_name: tool name
        tool_config: tool configuration (may contain 'timeout', possibly 'auto')
        command_params: command placeholder values; not mutated — a copy is
            used internally, and 'output_file' is always replaced with a
            freshly generated unique path
        result_dir: directory where the output file is created
        scan_type: scan type used to look up the command template

    Returns:
        str: output file path on success, "" on failure
    """
    # Imported lazily so Django/Prefect are fully initialized first.
    from apps.scan.tasks.subdomain_discovery import run_subdomain_discovery_task

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    short_uuid = uuid.uuid4().hex[:4]
    output_file = str(result_dir / f"{tool_name}_{timestamp}_{short_uuid}.txt")

    # Bug fix: work on a copy so the caller's dict is not mutated in place
    # (the original wrote 'output_file' straight into the caller-owned dict).
    command_params = {**command_params, 'output_file': output_file}

    try:
        command = build_scan_command(
            tool_name=tool_name,
            scan_type=scan_type,
            command_params=command_params,
            tool_config=tool_config
        )
    except Exception as e:
        logger.error(f"构建 {tool_name} 命令失败: {e}")
        return ""

    # 'auto' (or a missing key) falls back to a 3600s default.
    timeout = tool_config.get('timeout', 3600)
    if timeout == 'auto':
        timeout = 3600

    logger.info(f"执行 {tool_name}: timeout={timeout}s")
    try:
        result = run_subdomain_discovery_task(
            tool=tool_name,
            command=command,
            timeout=timeout,
            output_file=output_file
        )
        return result if result else ""
    except Exception as e:
        logger.warning(f"{tool_name} 执行失败: {e}")
        return ""
def _count_lines(file_path: str) -> int:
"""
统计文件非空行数
Args:
file_path: 文件路径
Returns:
int: 非空行数量
"""
try:
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
return sum(1 for line in f if line.strip())
except Exception as e:
logger.warning(f"统计文件行数失败: {file_path} - {e}")
return 0
def _merge_files(file_list: list, output_file: str) -> str:
    """Merge several result files into one, de-duplicating domains.

    Args:
        file_list: input file paths (empty/missing entries are skipped)
        output_file: path of the merged, sorted output file

    Returns:
        str: the output file path
    """
    unique_domains = set()
    for path in file_list:
        # Skip empty path strings and files that no longer exist.
        if not path or not Path(path).exists():
            continue
        with open(path, 'r', encoding='utf-8', errors='ignore') as src:
            for raw_line in src:
                entry = raw_line.strip()
                if entry:
                    unique_domains.add(entry)
    with open(output_file, 'w', encoding='utf-8') as dst:
        dst.writelines(domain + '\n' for domain in sorted(unique_domains))
    logger.info(f"合并完成: {len(unique_domains)} 个域名 -> {output_file}")
    return output_file
@flow(
    name="subdomain_discovery",
    log_prints=True,
    on_running=[on_scan_flow_running],
    on_completion=[on_scan_flow_completed],
    on_failure=[on_scan_flow_failed],
)
def subdomain_discovery_flow(
    scan_id: int,
    target_name: str,
    target_id: int,
    scan_workspace_dir: str,
    enabled_tools: dict
) -> dict:
    """Subdomain discovery scan flow.

    Workflow (4 stages):
        Stage 1: passive collection (parallel) - mandatory
        Stage 2: wordlist bruteforce (optional)
        Stage 3: permutation generation + verification (optional) - dnsgen + resolve
        Stage 4: DNS liveness verification (optional)
        Final:   save results to the database

    Args:
        scan_id: scan task ID
        target_name: target name (domain)
        target_id: target ID
        scan_workspace_dir: scan workspace directory (created by the service layer)
        enabled_tools: scan configuration dict:
            {
                'passive_tools': {...},
                'bruteforce': {...},
                'permutation': {...},
                'resolve': {...}
            }

    Returns:
        dict: scan result summary

    Raises:
        ValueError: configuration error
        RuntimeError: runtime failure
    """
    try:
        # ==================== Parameter validation ====================
        if scan_id is None:
            raise ValueError("scan_id 不能为空")
        if target_id is None:
            raise ValueError("target_id 不能为空")
        if not scan_workspace_dir:
            raise ValueError("scan_workspace_dir 不能为空")
        if enabled_tools is None:
            raise ValueError("enabled_tools 不能为空")
        scan_config = enabled_tools

        # Without a target domain there is nothing to scan; report empty success.
        if not target_name:
            logger.warning("未提供目标域名,跳过子域名发现扫描")
            return _empty_result(scan_id, '', scan_workspace_dir)

        # Imported lazily so Django/Prefect are fully initialized first.
        from apps.scan.tasks.subdomain_discovery import (
            run_subdomain_discovery_task,
            merge_and_validate_task,
            save_domains_task
        )

        # Step 0: prepare the working directory.
        result_dir = _setup_subdomain_directory(scan_workspace_dir)

        # Validate and normalize the target domain; an invalid domain aborts
        # gracefully with an empty result instead of failing the flow.
        try:
            domain_name = _validate_and_normalize_target(target_name)
        except ValueError as e:
            logger.warning("目标域名无效,跳过子域名发现扫描: %s", e)
            return _empty_result(scan_id, target_name, scan_workspace_dir)

        # Log the start banner only after validation succeeded.
        logger.info(
            "="*60 + "\n" +
            "开始子域名发现扫描\n" +
            f" Scan ID: {scan_id}\n" +
            f" Domain: {domain_name}\n" +
            f" Workspace: {scan_workspace_dir}\n" +
            "="*60
        )

        # Parse the per-stage configuration sections.
        passive_tools = scan_config.get('passive_tools', {})
        bruteforce_config = scan_config.get('bruteforce', {})
        permutation_config = scan_config.get('permutation', {})
        resolve_config = scan_config.get('resolve', {})

        # Keep only the passive tools that are enabled (default: enabled).
        enabled_passive_tools = {
            k: v for k, v in passive_tools.items()
            if v.get('enabled', True)
        }

        executed_tasks = []          # names of the pipeline steps that ran
        all_result_files = []        # Stage 1 raw result files
        failed_tools = []            # [{'tool': ..., 'reason': ...}]
        successful_tool_names = []

        # ==================== Stage 1: passive collection (parallel) ====================
        logger.info("=" * 40)
        logger.info("Stage 1: 被动收集(并行)")
        logger.info("=" * 40)
        if enabled_passive_tools:
            logger.info("启用工具: %s", ', '.join(enabled_passive_tools.keys()))
            result_files, stage1_failed, stage1_success = _run_scans_parallel(
                enabled_tools=enabled_passive_tools,
                domain_name=domain_name,
                result_dir=result_dir
            )
            all_result_files.extend(result_files)
            failed_tools.extend(stage1_failed)
            successful_tool_names.extend(stage1_success)
            executed_tasks.extend([f'passive ({tool})' for tool in stage1_success])
        else:
            logger.warning("未启用任何被动收集工具")

        # Merge Stage 1 results. `current_result` always points at the file
        # holding the current candidate set for the following stages.
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        current_result = str(result_dir / f"subs_passive_{timestamp}.txt")
        if all_result_files:
            current_result = _merge_files(all_result_files, current_result)
            executed_tasks.append('merge_passive')
        else:
            # Create an empty file so later stages still have a valid input path.
            Path(current_result).touch()
            logger.warning("Stage 1 无结果,创建空文件")

        # ==================== Stage 2: wordlist bruteforce (optional) ====================
        bruteforce_enabled = bruteforce_config.get('enabled', False)
        if bruteforce_enabled:
            logger.info("=" * 40)
            logger.info("Stage 2: 字典爆破")
            logger.info("=" * 40)
            bruteforce_tool_config = bruteforce_config.get('subdomain_bruteforce', {})
            wordlist_name = bruteforce_tool_config.get('wordlist_name', 'dns_wordlist.txt')
            try:
                # Make sure the wordlist file exists locally (with hash check).
                local_wordlist_path = ensure_wordlist_local(wordlist_name)
                # Fetch the wordlist record to derive an automatic timeout.
                wordlist_service = WordlistService()
                wordlist = wordlist_service.get_wordlist_by_name(wordlist_name)
                timeout_value = bruteforce_tool_config.get('timeout', 3600)
                if timeout_value == 'auto' and wordlist:
                    line_count = getattr(wordlist, 'line_count', None)
                    if line_count is None:
                        # Record has no line count: count the local file directly.
                        try:
                            with open(local_wordlist_path, 'rb') as f:
                                line_count = sum(1 for _ in f)
                        except OSError:
                            line_count = 0
                    try:
                        line_count_int = int(line_count)
                    except (TypeError, ValueError):
                        line_count_int = 0
                    # Budget 3 seconds per wordlist line, fall back to 3600s.
                    timeout_value = line_count_int * 3 if line_count_int > 0 else 3600
                    bruteforce_tool_config = {
                        **bruteforce_tool_config,
                        'timeout': timeout_value,
                    }
                    logger.info(
                        "subdomain_bruteforce 使用自动 timeout: %s 秒 (字典行数=%s, 3秒/行)",
                        timeout_value,
                        line_count_int,
                    )
                # NOTE(review): _run_single_tool generates its own output path,
                # which appears to supersede the 'output_file' passed below — confirm.
                brute_output = str(result_dir / f"subs_brute_{timestamp}.txt")
                brute_result = _run_single_tool(
                    tool_name='subdomain_bruteforce',
                    tool_config=bruteforce_tool_config,
                    command_params={
                        'domain': domain_name,
                        'wordlist': local_wordlist_path,
                        'output_file': brute_output
                    },
                    result_dir=result_dir
                )
                if brute_result:
                    # Merge Stage 1 + Stage 2 candidates.
                    current_result = _merge_files(
                        [current_result, brute_result],
                        str(result_dir / f"subs_merged_{timestamp}.txt")
                    )
                    successful_tool_names.append('subdomain_bruteforce')
                    executed_tasks.append('bruteforce')
                else:
                    failed_tools.append({'tool': 'subdomain_bruteforce', 'reason': '执行失败'})
            except Exception as exc:
                logger.warning("字典准备失败,跳过字典爆破: %s", exc)
                failed_tools.append({'tool': 'subdomain_bruteforce', 'reason': str(exc)})

        # ==================== Stage 3: permutation + verification (optional) ====================
        permutation_enabled = permutation_config.get('enabled', False)
        if permutation_enabled:
            logger.info("=" * 40)
            logger.info("Stage 3: 变异生成 + 存活验证(流式管道)")
            logger.info("=" * 40)
            permutation_tool_config = permutation_config.get('subdomain_permutation_resolve', {})

            # === Step 3.1: wildcard-DNS sampling probe ===
            # Resolve a sample of 100x the candidate count; if more than 50x
            # of them resolve, wildcard DNS is assumed and permutation skipped.
            before_count = _count_lines(current_result)
            # Probe parameters.
            SAMPLE_MULTIPLIER = 100   # sample size = candidate count x 100
            EXPANSION_THRESHOLD = 50  # expansion limit = candidate count x 50
            SAMPLE_TIMEOUT = 7200     # sampling timeout: 2 hours
            sample_size = before_count * SAMPLE_MULTIPLIER
            max_allowed = before_count * EXPANSION_THRESHOLD
            sample_output = str(result_dir / f"subs_permuted_sample_{timestamp}.txt")
            sample_cmd = (
                f"cat {current_result} | dnsgen - | head -n {sample_size} | "
                f"puredns resolve -r /app/backend/resources/resolvers.txt "
                f"--write {sample_output} --wildcard-tests 50 --wildcard-batch 1000000 --quiet"
            )
            logger.info(
                f"泛解析采样检测: 原文件 {before_count} 个, "
                f"采样 {sample_size} 个, 阈值 {max_allowed}"
            )
            try:
                subprocess.run(
                    sample_cmd,
                    shell=True,
                    timeout=SAMPLE_TIMEOUT,
                    check=False,
                    capture_output=True
                )
                sample_result_count = _count_lines(sample_output) if Path(sample_output).exists() else 0
                logger.info(
                    f"采样结果: {sample_result_count} 个域名存活 "
                    f"(原文件: {before_count}, 阈值: {max_allowed})"
                )
                if sample_result_count > max_allowed:
                    # Above the threshold: wildcard DNS suspected, skip full permutation.
                    ratio = sample_result_count / before_count if before_count > 0 else sample_result_count
                    logger.warning(
                        f"跳过变异: 采样检测到泛解析 "
                        f"({sample_result_count} > {max_allowed}, 膨胀率 {ratio:.1f}x)"
                    )
                    failed_tools.append({
                        'tool': 'subdomain_permutation_resolve',
                        'reason': f"采样检测到泛解析 (膨胀率 {ratio:.1f}x)"
                    })
                else:
                    # === Step 3.2: sampling passed, run the full permutation ===
                    logger.info("采样检测通过,执行完整变异...")
                    permuted_output = str(result_dir / f"subs_permuted_{timestamp}.txt")
                    permuted_result = _run_single_tool(
                        tool_name='subdomain_permutation_resolve',
                        tool_config=permutation_tool_config,
                        command_params={
                            'input_file': current_result,
                            'output_file': permuted_output,
                        },
                        result_dir=result_dir
                    )
                    if permuted_result:
                        # Merge previous candidates + verified permutations.
                        current_result = _merge_files(
                            [current_result, permuted_result],
                            str(result_dir / f"subs_with_permuted_{timestamp}.txt")
                        )
                        successful_tool_names.append('subdomain_permutation_resolve')
                        executed_tasks.append('permutation')
                    else:
                        failed_tools.append({'tool': 'subdomain_permutation_resolve', 'reason': '执行失败'})
            except subprocess.TimeoutExpired:
                logger.warning(f"采样检测超时 ({SAMPLE_TIMEOUT}秒),跳过变异")
                failed_tools.append({'tool': 'subdomain_permutation_resolve', 'reason': '采样检测超时'})
            except Exception as e:
                logger.warning(f"采样检测失败: {e},跳过变异")
                failed_tools.append({'tool': 'subdomain_permutation_resolve', 'reason': f'采样检测失败: {e}'})

        # ==================== Stage 4: DNS liveness verification (optional) ====================
        # Runs whenever resolve.enabled is true, regardless of Stage 3: one
        # unified DNS resolution pass over all current candidate subdomains.
        resolve_enabled = resolve_config.get('enabled', False)
        if resolve_enabled:
            logger.info("=" * 40)
            logger.info("Stage 4: DNS 存活验证")
            logger.info("=" * 40)
            resolve_tool_config = resolve_config.get('subdomain_resolve', {})
            # Derive the timeout from the candidate count when 'timeout: auto'.
            timeout_value = resolve_tool_config.get('timeout', 3600)
            if timeout_value == 'auto':
                line_count = 0
                try:
                    with open(current_result, 'rb') as f:
                        line_count = sum(1 for _ in f)
                except OSError:
                    line_count = 0
                try:
                    line_count_int = int(line_count)
                except (TypeError, ValueError):
                    line_count_int = 0
                # Budget 3 seconds per candidate, fall back to 3600s.
                timeout_value = line_count_int * 3 if line_count_int > 0 else 3600
                resolve_tool_config = {
                    **resolve_tool_config,
                    'timeout': timeout_value,
                }
                logger.info(
                    "subdomain_resolve 使用自动 timeout: %s 秒 (候选子域数=%s, 3秒/域名)",
                    timeout_value,
                    line_count_int,
                )
            alive_output = str(result_dir / f"subs_alive_{timestamp}.txt")
            alive_result = _run_single_tool(
                tool_name='subdomain_resolve',
                tool_config=resolve_tool_config,
                command_params={
                    'input_file': current_result,
                    'output_file': alive_output,
                },
                result_dir=result_dir
            )
            if alive_result:
                current_result = alive_result
                successful_tool_names.append('subdomain_resolve')
                executed_tasks.append('resolve')
            else:
                failed_tools.append({'tool': 'subdomain_resolve', 'reason': '执行失败'})

        # ==================== Final: save to the database ====================
        logger.info("=" * 40)
        logger.info("Final: 保存到数据库")
        logger.info("=" * 40)
        # Final validation pass over the surviving candidates, then persist.
        final_file = merge_and_validate_task(
            result_files=[current_result],
            result_dir=str(result_dir)
        )
        save_result = save_domains_task(
            domains_file=final_file,
            scan_id=scan_id,
            target_id=target_id
        )
        processed_domains = save_result.get('processed_records', 0)
        executed_tasks.append('save_domains')

        logger.info("="*60 + "\n✓ 子域名发现扫描完成\n" + "="*60)
        return {
            'success': True,
            'scan_id': scan_id,
            'target': domain_name,
            'scan_workspace_dir': scan_workspace_dir,
            'total': processed_domains,
            'executed_tasks': executed_tasks,
            'tool_stats': {
                # Counts tool "slots": every enabled passive tool plus one
                # slot per enabled optional stage.
                'total': len(enabled_passive_tools) + (1 if bruteforce_enabled else 0) +
                (1 if permutation_enabled else 0) + (1 if resolve_enabled else 0),
                'successful': len(successful_tool_names),
                'failed': len(failed_tools),
                'successful_tools': successful_tool_names,
                'failed_tools': failed_tools
            }
        }
    except ValueError as e:
        logger.error("配置错误: %s", e)
        raise
    except RuntimeError as e:
        logger.error("运行时错误: %s", e)
        raise
    except Exception as e:
        logger.exception("子域名发现扫描失败: %s", e)
        raise
def _empty_result(scan_id: int, target: str, scan_workspace_dir: str) -> dict:
"""返回空结果"""
return {
'success': True,
'scan_id': scan_id,
'target': target,
'scan_workspace_dir': scan_workspace_dir,
'total': 0,
'executed_tasks': [],
'tool_stats': {
'total': 0,
'successful': 0,
'failed': 0,
'successful_tools': [],
'failed_tools': []
}
}