Files
xingrin/backend/apps/common/management/commands/db_monitor.py
2025-12-12 18:04:57 +08:00

165 lines
7.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
简化的数据库性能监控命令
专注于可能导致查询延迟的关键指标
"""
import time

from django.core.management.base import BaseCommand, CommandError
from django.db import DatabaseError, connections
class Command(BaseCommand):
    """Simplified database performance monitor.

    Samples PostgreSQL statistics views a fixed number of times and prints
    the indicators most likely to explain query latency: connection states,
    lock waits, long-running queries, cache hit ratio, checkpoint cost,
    database size and the round-trip time of a trivial query.
    """

    help = '监控数据库性能关键指标'

    # Thresholds at which a metric is highlighted in the output.
    CACHE_HIT_WARN_PERCENT = 95      # warn when the hit ratio drops below this
    CHECKPOINT_WRITE_WARN_MS = 10000  # warn above this checkpoint write time
    CHECKPOINT_SYNC_WARN_MS = 5000    # warn above this checkpoint sync time
    LATENCY_WARN_MS = 200            # "medium" latency above this
    LATENCY_ERROR_MS = 500           # "high" latency above this

    def add_arguments(self, parser):
        """Register the ``--interval`` and ``--count`` command-line options."""
        parser.add_argument(
            '--interval',
            type=int,
            default=5,
            help='监控间隔(秒,默认: 5)',
        )
        parser.add_argument(
            '--count',
            type=int,
            default=3,
            help='监控次数(默认: 3)',
        )

    def handle(self, *args, **options):
        """Run ``count`` monitoring rounds, sleeping ``interval`` seconds between them.

        Raises:
            CommandError: if ``--interval`` is negative (``time.sleep`` would
                otherwise fail with a bare ``ValueError`` after the first round).
        """
        interval = options['interval']
        count = options['count']

        if interval < 0:
            raise CommandError('--interval 不能为负数')

        self.stdout.write("🔍 数据库性能监控开始...")

        for round_no in range(1, count + 1):
            # No delay before the first sample; only between samples.
            if round_no > 1:
                time.sleep(interval)
            self.stdout.write(f"\n=== 第 {round_no} 次监控 ===")
            self.monitor_key_metrics()

    def monitor_key_metrics(self):
        """Collect and print one snapshot of all key metrics.

        Uses the ``default`` database connection. Any failure is reported as
        a single error line instead of propagating, so that the remaining
        monitoring rounds still run.
        """
        db_connection = connections['default']

        try:
            with db_connection.cursor() as cursor:
                self._report_connections(cursor)
                self._report_locks(cursor)
                self._report_long_queries(cursor)
                self._report_cache_hit_ratio(cursor)
                self._report_checkpoints(cursor)
                self._report_database_size(cursor)
                self._report_query_latency(cursor)
        except Exception as e:
            # Top-level boundary of one monitoring round: report and carry on.
            self.stdout.write(self.style.ERROR(f"监控失败: {e}"))

    def _emit(self, message, highlight=None):
        """Write ``message``, wrapped by the ``highlight`` style function if given."""
        self.stdout.write(highlight(message) if highlight else message)

    def _report_connections(self, cursor):
        """Print session counts grouped by state (metric 1)."""
        # Idle sessions and background processes also carry a non-NULL
        # wait_event_type (e.g. 'Client', 'Activity'), so "waiting" is
        # restricted to sessions that are actively executing a query.
        cursor.execute("""
            SELECT
                count(*) as total_connections,
                count(*) FILTER (WHERE state = 'active') as active,
                count(*) FILTER (WHERE state = 'idle') as idle,
                count(*) FILTER (WHERE state = 'idle in transaction') as idle_in_trans,
                count(*) FILTER (
                    WHERE state = 'active' AND wait_event_type IS NOT NULL
                ) as waiting
            FROM pg_stat_activity;
        """)
        total, active, idle, idle_in_trans, waiting = cursor.fetchone()
        self._emit(
            f"连接: 总计{total} | 活跃{active} | 空闲{idle} | 事务中{idle_in_trans} | 等待{waiting}"
        )

    def _report_locks(self, cursor):
        """Print total and ungranted lock counts; warn if any lock is waiting (metric 2)."""
        cursor.execute("""
            SELECT
                count(*) as total_locks,
                count(*) FILTER (WHERE NOT granted) as waiting_locks
            FROM pg_locks;
        """)
        total_locks, waiting_locks = cursor.fetchone()
        message = f"🔒 锁: 总计{total_locks} | 等待{waiting_locks}"
        self._emit(message, self.style.WARNING if waiting_locks > 0 else None)

    def _report_long_queries(self, cursor):
        """List active queries that have been running for more than one second (metric 3)."""
        # Queries mentioning pg_stat_activity are skipped so that monitoring
        # statements (including this one) do not report themselves.
        cursor.execute("""
            SELECT
                pid,
                application_name,
                now() - query_start as duration,
                state,
                left(query, 60) as query_preview
            FROM pg_stat_activity
            WHERE state = 'active'
              AND query_start < now() - interval '1 second'
              AND query NOT LIKE '%pg_stat_activity%'
            ORDER BY query_start;
        """)
        long_queries = cursor.fetchall()

        if not long_queries:
            self._emit("⏱️ 长查询: 无")
            return

        self._emit(f"⏱️ 长查询 ({len(long_queries)} 个):", self.style.WARNING)
        for pid, app_name, duration, _state, preview in long_queries:
            self._emit(f" PID {pid} ({app_name}): {duration} - {preview}...")

    def _report_cache_hit_ratio(self, cursor):
        """Print the heap block cache hit ratio over all user tables (metric 4)."""
        cursor.execute("""
            SELECT
                sum(heap_blks_hit) as cache_hits,
                sum(heap_blks_read) as disk_reads,
                CASE
                    WHEN sum(heap_blks_hit) + sum(heap_blks_read) = 0 THEN 0
                    ELSE round(sum(heap_blks_hit) * 100.0 / (sum(heap_blks_hit) + sum(heap_blks_read)), 2)
                END as hit_ratio
            FROM pg_statio_user_tables;
        """)
        cache_hits, disk_reads, ratio = cursor.fetchone()

        # The sums are NULL when there are no rows, and 0 before any block access.
        if not (cache_hits or disk_reads):
            self._emit("💾 缓存: 暂无统计数据")
            return

        hit_ratio = ratio or 0
        message = f"💾 缓存命中率: {hit_ratio}% (缓存:{cache_hits}, 磁盘:{disk_reads})"
        low = hit_ratio < self.CACHE_HIT_WARN_PERCENT
        self._emit(message, self.style.WARNING if low else None)

    def _report_checkpoints(self, cursor):
        """Print checkpoint counts and cumulative write/sync times (metric 5).

        Best effort: if the statistics cannot be read, a "not available"
        line is printed and the remaining metrics are still collected.
        """
        unavailable = "📝 检查点: 统计不可用"

        try:
            # Probe the column set first: PostgreSQL 17 moved the checkpoint
            # counters from pg_stat_bgwriter to pg_stat_checkpointer.
            cursor.execute("SELECT * FROM pg_stat_bgwriter LIMIT 1;")
            bgwriter_cols = [desc[0] for desc in cursor.description]

            if 'checkpoints_timed' in bgwriter_cols:
                cursor.execute("""
                    SELECT
                        checkpoints_timed,
                        checkpoints_req,
                        checkpoint_write_time,
                        checkpoint_sync_time
                    FROM pg_stat_bgwriter;
                """)
            else:
                cursor.execute("""
                    SELECT
                        num_timed,
                        num_requested,
                        write_time,
                        sync_time
                    FROM pg_stat_checkpointer;
                """)
            row = cursor.fetchone()
        except DatabaseError:
            self._emit(unavailable)
            return

        if row is None:
            self._emit(unavailable)
            return

        timed, requested, write_ms, sync_ms = row
        total_checkpoints = timed + requested
        message = f"📝 检查点: 总计{total_checkpoints} | 写入{write_ms}ms | 同步{sync_ms}ms"
        slow = (
            write_ms > self.CHECKPOINT_WRITE_WARN_MS
            or sync_ms > self.CHECKPOINT_SYNC_WARN_MS
        )
        self._emit(message, self.style.WARNING if slow else None)

    def _report_database_size(self, cursor):
        """Print the on-disk size of the current database in MB (metric 6)."""
        cursor.execute("SELECT pg_database_size(current_database());")
        db_size = cursor.fetchone()[0]
        db_size_mb = round(db_size / 1024 / 1024, 2)
        self._emit(f"💿 数据库大小: {db_size_mb} MB")

    def _report_query_latency(self, cursor):
        """Time a trivial ``SELECT 1`` round trip and grade the result (metric 7)."""
        # perf_counter is monotonic, so the measurement is immune to
        # wall-clock adjustments that can skew time.time() deltas.
        start_time = time.perf_counter()
        cursor.execute("SELECT 1")
        cursor.fetchone()
        query_latency = (time.perf_counter() - start_time) * 1000

        if query_latency > self.LATENCY_ERROR_MS:
            self._emit(f"⚡ 查询延迟: {query_latency:.2f}ms (高)", self.style.ERROR)
        elif query_latency > self.LATENCY_WARN_MS:
            self._emit(f"⚡ 查询延迟: {query_latency:.2f}ms (中)", self.style.WARNING)
        else:
            self._emit(f"⚡ 查询延迟: {query_latency:.2f}ms (正常)")