Files
xingrin/backend/apps/scan/tasks/url_fetch/clean_urls_task.py
2025-12-12 18:04:57 +08:00

206 lines
6.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
URL 清理任务
使用 uro 工具清理合并后的 URL 列表:
- 去除重复和相似的 URL
- 根据扩展名过滤whitelist/blacklist
- 智能过滤无效 URL
"""
import logging
import subprocess
from pathlib import Path
from datetime import datetime
from prefect import task
from typing import Optional
from apps.scan.utils import execute_and_wait
logger = logging.getLogger(__name__)
@task(
name='clean_urls_with_uro',
retries=1,
log_prints=True
)
def clean_urls_task(
input_file: str,
output_dir: str,
timeout: int = 60,
whitelist: Optional[list] = None,
blacklist: Optional[list] = None,
filters: Optional[list] = None
) -> dict:
"""
使用 uro 清理 URL 列表
Args:
input_file: 输入的 URL 文件路径
output_dir: 输出目录
timeout: 超时时间(秒)
whitelist: 只保留指定扩展名的 URL
blacklist: 排除指定扩展名的 URL
filters: 额外的过滤规则
Returns:
dict: {
'success': bool,
'output_file': str,
'input_count': int,
'output_count': int,
'removed_count': int
}
"""
input_path = Path(input_file)
output_path = Path(output_dir)
# 1. 验证输入文件
if not input_path.exists():
logger.error("输入文件不存在: %s", input_file)
return {
'success': False,
'output_file': input_file,
'input_count': 0,
'output_count': 0,
'removed_count': 0,
'error': '输入文件不存在'
}
# 2. 统计输入 URL 数量
try:
result = subprocess.run(
['wc', '-l', str(input_path)],
capture_output=True,
text=True,
check=True
)
input_count = int(result.stdout.strip().split()[0])
except Exception as e:
logger.warning("统计输入文件行数失败: %s", e)
input_count = 0
with open(input_path, 'r') as f:
input_count = sum(1 for line in f if line.strip())
if input_count == 0:
logger.warning("输入文件为空,跳过 uro 清理")
return {
'success': True,
'output_file': input_file,
'input_count': 0,
'output_count': 0,
'removed_count': 0
}
logger.info("开始 uro 清理 - 输入 URL 数: %d", input_count)
# 3. 生成输出文件路径
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
output_file = output_path / f"urls_cleaned_{timestamp}.txt"
# 4. 构建 uro 命令
cmd_parts = ['uro', '-i', str(input_path), '-o', str(output_file)]
if whitelist:
cmd_parts.extend(['-w'] + [str(w) for w in whitelist])
if blacklist:
cmd_parts.extend(['-b'] + [str(b) for b in blacklist])
if filters:
cmd_parts.extend(['-f'] + [str(f) for f in filters])
# 5. 构建命令字符串
command = ' '.join(cmd_parts)
log_file = str(output_path / f"uro_{timestamp}.log")
logger.debug("uro 命令: %s", command)
# 6. 使用 execute_and_wait 执行(会自动发送通知)
try:
result = execute_and_wait(
tool_name='uro',
command=command,
timeout=timeout,
log_file=log_file
)
if result['returncode'] != 0:
logger.warning(
"uro 返回非零状态码: %d",
result['returncode']
)
# uro 可能正常完成但返回非零,检查输出文件
if not output_file.exists():
return {
'success': False,
'output_file': input_file,
'input_count': input_count,
'output_count': input_count,
'removed_count': 0,
'error': f'uro 执行失败 (returncode: {result["returncode"]})'
}
except RuntimeError as e:
# execute_and_wait 超时或执行失败会抛出 RuntimeError
logger.error("uro 执行失败: %s", e)
return {
'success': False,
'output_file': input_file,
'input_count': input_count,
'output_count': input_count,
'removed_count': 0,
'error': str(e)
}
except Exception as e:
logger.error("uro 执行异常: %s", e)
return {
'success': False,
'output_file': input_file,
'input_count': input_count,
'output_count': input_count,
'removed_count': 0,
'error': str(e)
}
# 7. 统计清理后的 URL 数量
output_count = 0
if output_file.exists():
try:
result = subprocess.run(
['wc', '-l', str(output_file)],
capture_output=True,
text=True,
check=True
)
output_count = int(result.stdout.strip().split()[0])
except Exception:
with open(output_file, 'r') as f:
output_count = sum(1 for line in f if line.strip())
else:
logger.warning("uro 未生成输出文件,使用原始文件")
return {
'success': False,
'output_file': input_file,
'input_count': input_count,
'output_count': input_count,
'removed_count': 0,
'error': '未生成输出文件'
}
removed_count = input_count - output_count
logger.info(
"✓ uro 清理完成 - 输入: %d, 输出: %d, 移除: %d (%.1f%%)",
input_count, output_count, removed_count,
(removed_count / input_count * 100) if input_count > 0 else 0
)
return {
'success': True,
'output_file': str(output_file),
'input_count': input_count,
'output_count': output_count,
'removed_count': removed_count
}