mirror of
https://github.com/yyhuni/xingrin.git
synced 2026-02-02 04:33:10 +08:00
206 lines
6.0 KiB
Python
206 lines
6.0 KiB
Python
"""
|
||
URL 清理任务
|
||
|
||
使用 uro 工具清理合并后的 URL 列表:
|
||
- 去除重复和相似的 URL
|
||
- 根据扩展名过滤(whitelist/blacklist)
|
||
- 智能过滤无效 URL
|
||
"""
|
||
|
||
import logging
|
||
import subprocess
|
||
from pathlib import Path
|
||
from datetime import datetime
|
||
from prefect import task
|
||
from typing import Optional
|
||
|
||
from apps.scan.utils import execute_and_wait
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
def _count_nonempty_lines(path: Path) -> int:
    """Return the number of non-blank lines in *path*.

    The file is read in binary mode so undecodable bytes inside a URL cannot
    raise a decoding error, and the last line is counted even when it has no
    trailing newline (``wc -l`` counts newline characters and would miss it).
    """
    with open(path, 'rb') as handle:
        return sum(1 for line in handle if line.strip())


def _failure_result(input_file: str, input_count: int, error: str) -> dict:
    """Build the result dict returned when cleaning could not be performed.

    The original input file is reported as the output so callers can keep
    working with the uncleaned list; nothing is counted as removed.
    """
    return {
        'success': False,
        'output_file': input_file,
        'input_count': input_count,
        'output_count': input_count,
        'removed_count': 0,
        'error': error,
    }


def _build_uro_command(
    input_path: Path,
    output_file: Path,
    whitelist: Optional[list],
    blacklist: Optional[list],
    filters: Optional[list],
) -> str:
    """Assemble the uro command line as a single, safely quoted string."""
    # Imported locally so this block does not depend on a file-level import.
    import shlex

    parts = ['uro', '-i', str(input_path), '-o', str(output_file)]
    for flag, values in (('-w', whitelist), ('-b', blacklist), ('-f', filters)):
        if values:
            parts.append(flag)
            parts.extend(str(value) for value in values)

    # Quote every argument: paths or filter values containing spaces or shell
    # metacharacters must not be re-split or interpreted. Arguments made only
    # of safe characters are left unchanged by shlex.quote.
    # NOTE(review): assumes execute_and_wait parses the command with
    # shell/shlex rules - confirm against its implementation.
    return ' '.join(shlex.quote(part) for part in parts)


@task(
    name='clean_urls_with_uro',
    retries=1,
    log_prints=True
)
def clean_urls_task(
    input_file: str,
    output_dir: str,
    timeout: int = 60,
    whitelist: Optional[list] = None,
    blacklist: Optional[list] = None,
    filters: Optional[list] = None
) -> dict:
    """
    Clean a URL list with uro.

    Args:
        input_file: Path of the input URL file (one URL per line).
        output_dir: Directory where the cleaned file and the uro log are written.
        timeout: Timeout in seconds passed to ``execute_and_wait``.
        whitelist: Values passed to uro's ``-w`` option (keep only these extensions).
        blacklist: Values passed to uro's ``-b`` option (drop these extensions).
        filters: Values passed to uro's ``-f`` option (extra filter rules).

    Returns:
        dict: {
            'success': bool,
            'output_file': str,   # cleaned file, or input_file when nothing was cleaned
            'input_count': int,   # non-blank lines in the input file
            'output_count': int,  # non-blank lines in the cleaned file
            'removed_count': int
        }
        Failure results additionally carry an 'error' message.

    Raises:
        OSError: If the input or output file exists but cannot be read
            while counting its lines.
    """
    input_path = Path(input_file)
    output_path = Path(output_dir)

    # 1. Validate the input file.
    if not input_path.exists():
        logger.error("输入文件不存在: %s", input_file)
        return _failure_result(input_file, 0, '输入文件不存在')

    # 2. Count input URLs (non-blank lines).
    input_count = _count_nonempty_lines(input_path)
    if input_count == 0:
        logger.warning("输入文件为空,跳过 uro 清理")
        return {
            'success': True,
            'output_file': input_file,
            'input_count': 0,
            'output_count': 0,
            'removed_count': 0
        }

    logger.info("开始 uro 清理 - 输入 URL 数: %d", input_count)

    # 3. Derive the output and log file paths from a shared timestamp.
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = output_path / f"urls_cleaned_{timestamp}.txt"
    log_file = str(output_path / f"uro_{timestamp}.log")

    # 4. Build the uro command string.
    command = _build_uro_command(
        input_path, output_file, whitelist, blacklist, filters
    )
    logger.debug("uro 命令: %s", command)

    # 5. Run uro through execute_and_wait.
    try:
        result = execute_and_wait(
            tool_name='uro',
            command=command,
            timeout=timeout,
            log_file=log_file
        )

        if result['returncode'] != 0:
            logger.warning(
                "uro 返回非零状态码: %d",
                result['returncode']
            )
            # A non-zero exit code is tolerated as long as uro still produced
            # an output file; only a missing file is treated as a failure.
            if not output_file.exists():
                return _failure_result(
                    input_file,
                    input_count,
                    f'uro 执行失败 (returncode: {result["returncode"]})'
                )

    except RuntimeError as e:
        # Handled separately from other exceptions only to log a distinct message.
        logger.error("uro 执行失败: %s", e)
        return _failure_result(input_file, input_count, str(e))
    except Exception as e:
        # Best-effort boundary: any other error is reported in the result
        # dict instead of propagating.
        logger.error("uro 执行异常: %s", e)
        return _failure_result(input_file, input_count, str(e))

    # 6. Count the cleaned URLs.
    if not output_file.exists():
        logger.warning("uro 未生成输出文件,使用原始文件")
        return _failure_result(input_file, input_count, '未生成输出文件')

    output_count = _count_nonempty_lines(output_file)
    removed_count = input_count - output_count

    logger.info(
        "✓ uro 清理完成 - 输入: %d, 输出: %d, 移除: %d (%.1f%%)",
        input_count, output_count, removed_count,
        # input_count is guaranteed > 0 here (the empty case returned above).
        removed_count / input_count * 100
    )

    return {
        'success': True,
        'output_file': str(output_file),
        'input_count': input_count,
        'output_count': output_count,
        'removed_count': removed_count
    }
|