"""CSV 导出工具模块
提供流式 CSV 生成功能支持
- UTF-8 BOMExcel 兼容
- RFC 4180 规范转义
- 流式生成内存友好
- Content-Length 的文件响应支持浏览器下载进度显示
2025-12-23 12:34:41 +08:00
"""
import csv
import io
import os
import tempfile
import logging
from datetime import datetime
from typing import Iterator, Dict, Any, List, Callable, Optional
from django.http import FileResponse, StreamingHttpResponse
logger = logging.getLogger(__name__)
# UTF-8 BOM确保 Excel 正确识别编码
UTF8_BOM = '\ufeff'
def generate_csv_rows(
    data_iterator: Iterator[Dict[str, Any]],
    headers: List[str],
    field_formatters: Optional[Dict[str, Callable]] = None
) -> Iterator[str]:
    """Stream CSV content as row strings.

    Args:
        data_iterator: Iterator of dicts, one per data row.
        headers: Ordered list of CSV column headers; also used as dict keys.
        field_formatters: Optional map of column name -> formatter callable
            applied to that column's raw value before writing.

    Yields:
        CSV row strings, each including the line terminator. The first
        yielded string is prefixed with a UTF-8 BOM so Excel detects the
        encoding correctly.

    Example:
        >>> data = [{'ip': '192.168.1.1', 'hosts': ['a.com', 'b.com']}]
        >>> headers = ['ip', 'hosts']
        >>> formatters = {'hosts': format_list_field}
        >>> for row in generate_csv_rows(iter(data), headers, formatters):
        ...     print(row, end='')
    """
    # One reusable buffer + writer for the whole stream: the original code
    # allocated a fresh StringIO and csv.writer for every data row.
    buffer = io.StringIO()
    writer = csv.writer(buffer, quoting=csv.QUOTE_MINIMAL)

    # Header row, prefixed with the UTF-8 BOM for Excel compatibility.
    writer.writerow(headers)
    yield '\ufeff' + buffer.getvalue()

    for row_data in data_iterator:
        # Rewind and clear the buffer instead of reallocating it.
        buffer.seek(0)
        buffer.truncate(0)
        row = []
        for header in headers:
            value = row_data.get(header, '')
            if field_formatters and header in field_formatters:
                value = field_formatters[header](value)
            # Render missing/None values as empty cells.
            row.append(value if value is not None else '')
        writer.writerow(row)
        yield buffer.getvalue()
def format_list_field(values: List, separator: str = ';') -> str:
    """Join a list of values into a single separator-delimited string.

    Args:
        values: List of values to join (items are stringified first).
        separator: Delimiter placed between items; defaults to a semicolon.

    Returns:
        The joined string, or '' when the list is empty or None.

    Example:
        >>> format_list_field(['a.com', 'b.com'])
        'a.com;b.com'
        >>> format_list_field([80, 443])
        '80;443'
        >>> format_list_field([])
        ''
        >>> format_list_field(None)
        ''
    """
    if values:
        return separator.join(map(str, values))
    return ''
def format_datetime(dt: Optional[datetime]) -> str:
    """Format a datetime as 'YYYY-MM-DD HH:MM:SS' in the local time zone.

    Args:
        dt: A datetime, an already-formatted string, or None.

    Returns:
        Formatted timestamp string ('' for None; strings pass through
        unchanged). Timezone-aware values are converted to the local
        zone configured in Django settings first.

    Example:
        >>> from datetime import datetime
        >>> format_datetime(datetime(2024, 1, 15, 10, 30, 0))
        '2024-01-15 10:30:00'
        >>> format_datetime(None)
        ''
    """
    if dt is None:
        return ''
    if isinstance(dt, str):
        # Already formatted upstream; pass through unchanged.
        return dt
    # Awareness check without Django (same test django.utils.timezone.is_aware
    # performs), so naive datetimes work outside a configured Django env.
    if dt.utcoffset() is not None:
        # Aware datetime: convert to the local zone from Django settings.
        from django.utils import timezone
        dt = timezone.localtime(dt)
    return dt.strftime('%Y-%m-%d %H:%M:%S')
def create_csv_export_response(
    data_iterator: Iterator[Dict[str, Any]],
    headers: List[str],
    filename: str,
    field_formatters: Optional[Dict[str, Callable]] = None,
    show_progress: bool = True
) -> FileResponse | StreamingHttpResponse:
    """Build the HTTP response for a CSV export.

    The response type depends on show_progress:
    - True: temp file + FileResponse with Content-Length, so the browser
      can display download progress.
    - False: StreamingHttpResponse — lower memory use, no progress bar.

    Args:
        data_iterator: Iterator of dicts, one per data row.
        headers: Ordered CSV column headers.
        filename: Download file name, e.g. "export_2024.csv".
        field_formatters: Optional map of column name -> formatter callable.
        show_progress: Whether to enable download progress; defaults to True.

    Returns:
        A FileResponse or StreamingHttpResponse ready to return from a view.

    Example:
        >>> data_iter = service.iter_data()
        >>> headers = ['url', 'host', 'created_at']
        >>> formatters = {'created_at': format_datetime}
        >>> response = create_csv_export_response(
        ...     data_iter, headers, 'websites.csv', formatters
        ... )
        >>> return response
    """
    # Pick the builder matching the requested mode, then delegate.
    builder = _create_file_response if show_progress else _create_streaming_response
    return builder(data_iterator, headers, filename, field_formatters)
def _create_file_response(
    data_iterator: Iterator[Dict[str, Any]],
    headers: List[str],
    filename: str,
    field_formatters: Optional[Dict[str, Callable]] = None
) -> FileResponse:
    """Create a FileResponse with Content-Length (enables browser
    download progress).

    Strategy: stream the CSV into a temporary file first, then serve that
    file — its known size provides the Content-Length header. The temp
    file is deleted when the response stream is closed.
    """
    # newline='' is required by the csv module: rows already end in \r\n,
    # and default text-mode newline translation would corrupt them on Windows.
    temp_file = tempfile.NamedTemporaryFile(
        mode='w',
        suffix='.csv',
        delete=False,
        encoding='utf-8',
        newline=''
    )
    temp_path = temp_file.name
    try:
        # Stream CSV rows into the temp file.
        for row in generate_csv_rows(data_iterator, headers, field_formatters):
            temp_file.write(row)
        temp_file.close()
        # Size of the finished file drives Content-Length.
        file_size = os.path.getsize(temp_path)
        response = FileResponse(
            open(temp_path, 'rb'),
            content_type='text/csv; charset=utf-8',
            as_attachment=True,
            filename=filename
        )
        response['Content-Length'] = file_size
        # Cleanup hook: delete the temp file once the response stream closes.
        original_close = response.file_to_stream.close

        def close_and_cleanup():
            original_close()
            try:
                os.unlink(temp_path)
            except OSError:
                pass

        response.file_to_stream.close = close_and_cleanup
        return response
    except Exception as e:
        # Best-effort cleanup, then re-raise the original error.
        try:
            temp_file.close()
        except Exception:
            # Was a bare `except:`; narrowed so BaseException
            # (KeyboardInterrupt/SystemExit) is no longer swallowed.
            pass
        try:
            os.unlink(temp_path)
        except OSError:
            pass
        logger.error("创建 CSV 导出响应失败: %s", e)
        raise
def _create_streaming_response(
    data_iterator: Iterator[Dict[str, Any]],
    headers: List[str],
    filename: str,
    field_formatters: Optional[Dict[str, Callable]] = None
) -> StreamingHttpResponse:
    """Create a streaming response without Content-Length (memory-friendly,
    but the browser cannot show download progress)."""
    response = StreamingHttpResponse(
        generate_csv_rows(data_iterator, headers, field_formatters),
        content_type='text/csv; charset=utf-8'
    )
    # Bug fix: the download filename was a hard-coded literal; interpolate
    # the caller-supplied filename so downloads are named correctly.
    response['Content-Disposition'] = f'attachment; filename="{filename}"'
    return response