diff --git a/backend/apps/asset/apps.py b/backend/apps/asset/apps.py index c34b78eb..dd0ec9ee 100644 --- a/backend/apps/asset/apps.py +++ b/backend/apps/asset/apps.py @@ -1,5 +1,9 @@ +import logging + from django.apps import AppConfig +logger = logging.getLogger(__name__) + class AssetConfig(AppConfig): default_auto_field = 'django.db.models.BigAutoField' @@ -8,3 +12,34 @@ class AssetConfig(AppConfig): def ready(self): # 导入所有模型以确保Django发现并注册 from . import models + + # 启用 pg_trgm 扩展(用于文本模糊搜索索引) + # 用于已有数据库升级场景 + self._ensure_pg_trgm_extension() + + def _ensure_pg_trgm_extension(self): + """ + 确保 pg_trgm 扩展已启用。 + 该扩展用于 response_body 和 response_headers 字段的 GIN 索引, + 支持高效的文本模糊搜索。 + """ + from django.db import connection + + # 检查是否为 PostgreSQL 数据库 + if connection.vendor != 'postgresql': + logger.debug("跳过 pg_trgm 扩展:当前数据库不是 PostgreSQL") + return + + try: + with connection.cursor() as cursor: + cursor.execute("CREATE EXTENSION IF NOT EXISTS pg_trgm;") + logger.debug("pg_trgm 扩展已启用") + except Exception as e: + # 记录错误但不阻止应用启动 + # 常见原因:权限不足(需要超级用户权限) + logger.warning( + "无法创建 pg_trgm 扩展: %s。" + "这可能导致 response_body 和 response_headers 字段的 GIN 索引无法正常工作。" + "请手动执行: CREATE EXTENSION IF NOT EXISTS pg_trgm;", + str(e) + ) diff --git a/backend/apps/asset/dtos/asset/endpoint_dto.py b/backend/apps/asset/dtos/asset/endpoint_dto.py index 94a393ca..caef414b 100644 --- a/backend/apps/asset/dtos/asset/endpoint_dto.py +++ b/backend/apps/asset/dtos/asset/endpoint_dto.py @@ -1,7 +1,7 @@ """Endpoint DTO""" from dataclasses import dataclass -from typing import Optional, List, Dict, Any +from typing import Optional, List @dataclass @@ -14,18 +14,16 @@ class EndpointDTO: status_code: Optional[int] = None content_length: Optional[int] = None webserver: Optional[str] = None - body_preview: Optional[str] = None + response_body: Optional[str] = None content_type: Optional[str] = None tech: Optional[List[str]] = None vhost: Optional[bool] = None location: Optional[str] = None matched_gf_patterns: Optional[List[str]] = None - response_headers: Optional[Dict[str, Any]] = None + response_headers: Optional[str] = None def __post_init__(self): if self.tech is None: self.tech = [] if self.matched_gf_patterns is None: self.matched_gf_patterns = [] - if self.response_headers is None: - self.response_headers = {} diff --git a/backend/apps/asset/dtos/asset/website_dto.py b/backend/apps/asset/dtos/asset/website_dto.py index 149055b9..4812c8ea 100644 --- a/backend/apps/asset/dtos/asset/website_dto.py +++ b/backend/apps/asset/dtos/asset/website_dto.py @@ -1,7 +1,7 @@ """WebSite DTO""" from dataclasses import dataclass -from typing import List, Optional, Dict, Any +from typing import List, Optional @dataclass @@ -17,13 +17,11 @@ class WebSiteDTO: webserver: str = '' content_type: str = '' tech: List[str] = None - body_preview: str = '' + response_body: str = '' vhost: Optional[bool] = None created_at: str = None - response_headers: Dict[str, Any] = None + response_headers: str = '' def __post_init__(self): if self.tech is None: self.tech = [] - if self.response_headers is None: - self.response_headers = {} diff --git a/backend/apps/asset/dtos/snapshot/endpoint_snapshot_dto.py b/backend/apps/asset/dtos/snapshot/endpoint_snapshot_dto.py index c8940a80..8c768b3b 100644 --- a/backend/apps/asset/dtos/snapshot/endpoint_snapshot_dto.py +++ b/backend/apps/asset/dtos/snapshot/endpoint_snapshot_dto.py @@ -1,7 +1,7 @@ """EndpointSnapshot DTO""" from dataclasses import dataclass -from typing import List, Optional, Dict, Any +from typing import List, Optional @dataclass @@ -22,19 +22,17 @@ class EndpointSnapshotDTO: webserver: str = '' content_type: str = '' tech: List[str] = None - body_preview: str = '' + response_body: str = '' vhost: Optional[bool] = None matched_gf_patterns: List[str] = None target_id: Optional[int] = None # 冗余字段,用于同步到资产表 - response_headers: Dict[str, Any] = None + response_headers: str = '' def __post_init__(self): if self.tech is None: self.tech = [] if self.matched_gf_patterns is None: self.matched_gf_patterns = [] - if self.response_headers is None: - self.response_headers = {} def to_asset_dto(self): """ @@ -56,11 +54,11 @@ class EndpointSnapshotDTO: status_code=self.status_code, content_length=self.content_length, webserver=self.webserver, - body_preview=self.body_preview, + response_body=self.response_body, content_type=self.content_type, tech=self.tech if self.tech else [], vhost=self.vhost, location=self.location, matched_gf_patterns=self.matched_gf_patterns if self.matched_gf_patterns else [], - response_headers=self.response_headers if self.response_headers else {}, + response_headers=self.response_headers, ) diff --git a/backend/apps/asset/dtos/snapshot/website_snapshot_dto.py b/backend/apps/asset/dtos/snapshot/website_snapshot_dto.py index 0c37be91..8bf35123 100644 --- a/backend/apps/asset/dtos/snapshot/website_snapshot_dto.py +++ b/backend/apps/asset/dtos/snapshot/website_snapshot_dto.py @@ -1,7 +1,7 @@ """WebsiteSnapshot DTO""" from dataclasses import dataclass -from typing import List, Optional, Dict, Any +from typing import List, Optional @dataclass @@ -23,15 +23,13 @@ class WebsiteSnapshotDTO: web_server: str = '' content_type: str = '' tech: List[str] = None - body_preview: str = '' + response_body: str = '' vhost: Optional[bool] = None - response_headers: Dict[str, Any] = None + response_headers: str = '' def __post_init__(self): if self.tech is None: self.tech = [] - if self.response_headers is None: - self.response_headers = {} def to_asset_dto(self): """ @@ -53,7 +51,7 @@ class WebsiteSnapshotDTO: webserver=self.web_server, content_type=self.content_type, tech=self.tech if self.tech else [], - body_preview=self.body_preview, + response_body=self.response_body, vhost=self.vhost, - response_headers=self.response_headers if self.response_headers else {}, + response_headers=self.response_headers, ) diff --git a/backend/apps/asset/models/asset_models.py b/backend/apps/asset/models/asset_models.py index 17a97974..d93c9366 100644 --- a/backend/apps/asset/models/asset_models.py +++ b/backend/apps/asset/models/asset_models.py @@ -85,11 +85,10 @@ class Endpoint(models.Model): default='', help_text='服务器类型(HTTP 响应头 Server 值)' ) - body_preview = models.CharField( - max_length=1000, + response_body = models.TextField( blank=True, default='', - help_text='响应正文前N个字符(默认100个字符)' + help_text='HTTP响应体' ) content_type = models.CharField( max_length=200, @@ -124,10 +123,10 @@ class Endpoint(models.Model): default=list, help_text='匹配的GF模式列表,用于识别敏感端点(如api, debug, config等)' ) - response_headers = models.JSONField( + response_headers = models.TextField( blank=True, - default=dict, - help_text='HTTP响应头(JSON格式)' + default='', + help_text='原始HTTP响应头' ) class Meta: @@ -143,7 +142,12 @@ class Endpoint(models.Model): models.Index(fields=['status_code']), # 状态码索引,优化筛选 models.Index(fields=['title']), # title索引,优化智能过滤搜索 GinIndex(fields=['tech']), # GIN索引,优化 tech 数组字段的 __contains 查询 - GinIndex(fields=['response_headers']), # GIN索引,优化 response_headers JSON 字段查询 + # pg_trgm GIN 索引,支持 LIKE '%keyword%' 模糊搜索 + GinIndex( + name='endpoint_resp_headers_trgm_idx', + fields=['response_headers'], + opclasses=['gin_trgm_ops'] + ), ] constraints = [ # 普通唯一约束:url + target 组合唯一 @@ -194,11 +198,10 @@ class WebSite(models.Model): default='', help_text='服务器类型(HTTP 响应头 Server 值)' ) - body_preview = models.CharField( - max_length=1000, + response_body = models.TextField( blank=True, default='', - help_text='响应正文前N个字符(默认100个字符)' + help_text='HTTP响应体' ) content_type = models.CharField( max_length=200, @@ -227,10 +230,10 @@ class WebSite(models.Model): blank=True, help_text='是否支持虚拟主机' ) - response_headers = models.JSONField( + response_headers = models.TextField( blank=True, - default=dict, - help_text='HTTP响应头(JSON格式)' + default='', + help_text='原始HTTP响应头' ) class Meta: @@ -246,7 +249,12 @@ class WebSite(models.Model): models.Index(fields=['title']), # title索引,优化智能过滤搜索 models.Index(fields=['status_code']), # 状态码索引,优化智能过滤搜索 GinIndex(fields=['tech']), # GIN索引,优化 tech 数组字段的 __contains 查询 - GinIndex(fields=['response_headers']), # GIN索引,优化 response_headers JSON 字段查询 + # pg_trgm GIN 索引,支持 LIKE '%keyword%' 模糊搜索 + GinIndex( + name='website_resp_headers_trgm_idx', + fields=['response_headers'], + opclasses=['gin_trgm_ops'] + ), ] constraints = [ # 普通唯一约束:url + target 组合唯一 diff --git a/backend/apps/asset/models/snapshot_models.py b/backend/apps/asset/models/snapshot_models.py index 498eaebd..91988a94 100644 --- a/backend/apps/asset/models/snapshot_models.py +++ b/backend/apps/asset/models/snapshot_models.py @@ -69,12 +69,12 @@ class WebsiteSnapshot(models.Model): default=list, help_text='技术栈' ) - body_preview = models.TextField(blank=True, default='', help_text='响应体预览') + response_body = models.TextField(blank=True, default='', help_text='HTTP响应体') vhost = models.BooleanField(null=True, blank=True, help_text='虚拟主机标志') - response_headers = models.JSONField( + response_headers = models.TextField( blank=True, - default=dict, - help_text='HTTP响应头(JSON格式)' + default='', + help_text='原始HTTP响应头' ) created_at = models.DateTimeField(auto_now_add=True, help_text='创建时间') @@ -90,7 +90,12 @@ class WebsiteSnapshot(models.Model): models.Index(fields=['title']), # title索引,优化标题搜索 models.Index(fields=['-created_at']), GinIndex(fields=['tech']), # GIN索引,优化数组字段查询 - GinIndex(fields=['response_headers']), # GIN索引,优化 JSON 字段查询 + # pg_trgm GIN 索引,支持 LIKE '%keyword%' 模糊搜索 + GinIndex( + name='ws_snap_resp_hdr_trgm', + fields=['response_headers'], + opclasses=['gin_trgm_ops'] + ), ] constraints = [ # 唯一约束:同一次扫描中,同一个URL只能记录一次 @@ -259,7 +264,7 @@ class EndpointSnapshot(models.Model): default=list, help_text='技术栈' ) - body_preview = models.CharField(max_length=1000, blank=True, default='', help_text='响应体预览') + response_body = models.TextField(blank=True, default='', help_text='HTTP响应体') vhost = models.BooleanField(null=True, blank=True, help_text='虚拟主机标志') matched_gf_patterns = ArrayField( models.CharField(max_length=100), @@ -267,10 +272,10 @@ class EndpointSnapshot(models.Model): default=list, help_text='匹配的GF模式列表' ) - response_headers = models.JSONField( + response_headers = models.TextField( blank=True, - default=dict, - help_text='HTTP响应头(JSON格式)' + default='', + help_text='原始HTTP响应头' ) created_at = models.DateTimeField(auto_now_add=True, help_text='创建时间') @@ -288,7 +293,12 @@ class EndpointSnapshot(models.Model): models.Index(fields=['webserver']), # webserver索引,优化服务器搜索 models.Index(fields=['-created_at']), GinIndex(fields=['tech']), # GIN索引,优化数组字段查询 - GinIndex(fields=['response_headers']), # GIN索引,优化 JSON 字段查询 + # pg_trgm GIN 索引,支持 LIKE '%keyword%' 模糊搜索 + GinIndex( + name='ep_snap_resp_hdr_trgm', + fields=['response_headers'], + opclasses=['gin_trgm_ops'] + ), ] constraints = [ # 唯一约束:同一次扫描中,同一个URL只能记录一次 diff --git a/backend/apps/asset/repositories/asset/endpoint_repository.py b/backend/apps/asset/repositories/asset/endpoint_repository.py index 295f2831..0e639bfb 100644 --- a/backend/apps/asset/repositories/asset/endpoint_repository.py +++ b/backend/apps/asset/repositories/asset/endpoint_repository.py @@ -48,13 +48,13 @@ class DjangoEndpointRepository: status_code=item.status_code, content_length=item.content_length, webserver=item.webserver or '', - body_preview=item.body_preview or '', + response_body=item.response_body or '', content_type=item.content_type or '', tech=item.tech if item.tech else [], vhost=item.vhost, location=item.location or '', matched_gf_patterns=item.matched_gf_patterns if item.matched_gf_patterns else [], - response_headers=item.response_headers if item.response_headers else {} + response_headers=item.response_headers if item.response_headers else '' ) for item in unique_items ] @@ -66,7 +66,7 @@ class DjangoEndpointRepository: unique_fields=['url', 'target'], update_fields=[ 'host', 'title', 'status_code', 'content_length', - 'webserver', 'body_preview', 'content_type', 'tech', + 'webserver', 'response_body', 'content_type', 'tech', 'vhost', 'location', 'matched_gf_patterns', 'response_headers' ], batch_size=1000 @@ -139,13 +139,13 @@ class DjangoEndpointRepository: status_code=item.status_code, content_length=item.content_length, webserver=item.webserver or '', - body_preview=item.body_preview or '', + response_body=item.response_body or '', content_type=item.content_type or '', tech=item.tech if item.tech else [], vhost=item.vhost, location=item.location or '', matched_gf_patterns=item.matched_gf_patterns if item.matched_gf_patterns else [], - response_headers=item.response_headers if item.response_headers else {} + response_headers=item.response_headers if item.response_headers else '' ) for item in unique_items ] @@ -185,7 +185,7 @@ class DjangoEndpointRepository: .values( 'url', 'host', 'location', 'title', 'status_code', 'content_length', 'content_type', 'webserver', 'tech', - 'body_preview', 'vhost', 'matched_gf_patterns', 'created_at' + 'response_body', 'response_headers', 'vhost', 'matched_gf_patterns', 'created_at' ) .order_by('url') ) diff --git a/backend/apps/asset/repositories/asset/website_repository.py b/backend/apps/asset/repositories/asset/website_repository.py index 2f0842b5..99185f1a 100644 --- a/backend/apps/asset/repositories/asset/website_repository.py +++ b/backend/apps/asset/repositories/asset/website_repository.py @@ -49,13 +49,13 @@ class DjangoWebSiteRepository: location=item.location or '', title=item.title or '', webserver=item.webserver or '', - body_preview=item.body_preview or '', + response_body=item.response_body or '', content_type=item.content_type or '', tech=item.tech if item.tech else [], status_code=item.status_code, content_length=item.content_length, vhost=item.vhost, - response_headers=item.response_headers if item.response_headers else {} + response_headers=item.response_headers if item.response_headers else '' ) for item in unique_items ] @@ -67,7 +67,7 @@ class DjangoWebSiteRepository: unique_fields=['url', 'target'], update_fields=[ 'host', 'location', 'title', 'webserver', - 'body_preview', 'content_type', 'tech', + 'response_body', 'content_type', 'tech', 'status_code', 'content_length', 'vhost', 'response_headers' ], batch_size=1000 @@ -133,13 +133,13 @@ class DjangoWebSiteRepository: location=item.location or '', title=item.title or '', webserver=item.webserver or '', - body_preview=item.body_preview or '', + response_body=item.response_body or '', content_type=item.content_type or '', tech=item.tech if item.tech else [], status_code=item.status_code, content_length=item.content_length, vhost=item.vhost, - response_headers=item.response_headers if item.response_headers else {} + response_headers=item.response_headers if item.response_headers else '' ) for item in unique_items ] @@ -179,7 +179,7 @@ class DjangoWebSiteRepository: .values( 'url', 'host', 'location', 'title', 'status_code', 'content_length', 'content_type', 'webserver', 'tech', - 'body_preview', 'vhost', 'created_at' + 'response_body', 'response_headers', 'vhost', 'created_at' ) .order_by('url') ) diff --git a/backend/apps/asset/repositories/snapshot/endpoint_snapshot_repository.py b/backend/apps/asset/repositories/snapshot/endpoint_snapshot_repository.py index 3ef07bcf..edfb30b9 100644 --- a/backend/apps/asset/repositories/snapshot/endpoint_snapshot_repository.py +++ b/backend/apps/asset/repositories/snapshot/endpoint_snapshot_repository.py @@ -52,10 +52,10 @@ class DjangoEndpointSnapshotRepository: webserver=item.webserver, content_type=item.content_type, tech=item.tech if item.tech else [], - body_preview=item.body_preview, + response_body=item.response_body, vhost=item.vhost, matched_gf_patterns=item.matched_gf_patterns if item.matched_gf_patterns else [], - response_headers=item.response_headers if item.response_headers else {} + response_headers=item.response_headers if item.response_headers else '' )) # 批量创建(忽略冲突,基于唯一约束去重) @@ -102,7 +102,7 @@ class DjangoEndpointSnapshotRepository: .values( 'url', 'host', 'location', 'title', 'status_code', 'content_length', 'content_type', 'webserver', 'tech', - 'body_preview', 'vhost', 'matched_gf_patterns', 'created_at' + 'response_body', 'response_headers', 'vhost', 'matched_gf_patterns', 'created_at' ) .order_by('url') ) diff --git a/backend/apps/asset/repositories/snapshot/website_snapshot_repository.py b/backend/apps/asset/repositories/snapshot/website_snapshot_repository.py index fd5d8d81..f776be59 100644 --- a/backend/apps/asset/repositories/snapshot/website_snapshot_repository.py +++ b/backend/apps/asset/repositories/snapshot/website_snapshot_repository.py @@ -52,9 +52,9 @@ class DjangoWebsiteSnapshotRepository: web_server=item.web_server, content_type=item.content_type, tech=item.tech if item.tech else [], - body_preview=item.body_preview, + response_body=item.response_body, vhost=item.vhost, - response_headers=item.response_headers if item.response_headers else {} + response_headers=item.response_headers if item.response_headers else '' )) # 批量创建(忽略冲突,基于唯一约束去重) @@ -101,7 +101,7 @@ class DjangoWebsiteSnapshotRepository: .values( 'url', 'host', 'location', 'title', 'status', 'content_length', 'content_type', 'web_server', 'tech', - 'body_preview', 'vhost', 'created_at' + 'response_body', 'response_headers', 'vhost', 'created_at' ) .order_by('url') ) @@ -118,7 +118,8 @@ class DjangoWebsiteSnapshotRepository: 'content_type': row['content_type'], 'webserver': row['web_server'], 'tech': row['tech'], - 'body_preview': row['body_preview'], + 'response_body': row['response_body'], + 'response_headers': row['response_headers'], 'vhost': row['vhost'], 'created_at': row['created_at'], } diff --git a/backend/apps/asset/serializers.py b/backend/apps/asset/serializers.py index e6cd8e8b..d3e0cd29 100644 --- a/backend/apps/asset/serializers.py +++ b/backend/apps/asset/serializers.py @@ -70,7 +70,7 @@ class WebSiteSerializer(serializers.ModelSerializer): """站点序列化器(目标详情页)""" subdomain = serializers.CharField(source='subdomain.name', allow_blank=True, default='') - responseHeaders = serializers.JSONField(source='response_headers', read_only=True) # HTTP响应头 + responseHeaders = serializers.CharField(source='response_headers', read_only=True) # 原始HTTP响应头 class Meta: model = WebSite @@ -84,7 +84,7 @@ class WebSiteSerializer(serializers.ModelSerializer): 'content_type', 'status_code', 'content_length', - 'body_preview', + 'response_body', 'tech', 'vhost', 'responseHeaders', # HTTP响应头 @@ -142,7 +142,7 @@ class EndpointListSerializer(serializers.ModelSerializer): source='matched_gf_patterns', read_only=True, ) - responseHeaders = serializers.JSONField(source='response_headers', read_only=True) # HTTP响应头 + responseHeaders = serializers.CharField(source='response_headers', read_only=True) # 原始HTTP响应头 class Meta: model = Endpoint @@ -155,7 +155,7 @@ class EndpointListSerializer(serializers.ModelSerializer): 'content_length', 'content_type', 'webserver', - 'body_preview', + 'response_body', 'tech', 'vhost', 'responseHeaders', # HTTP响应头 @@ -219,7 +219,7 @@ class WebsiteSnapshotSerializer(serializers.ModelSerializer): subdomain_name = serializers.CharField(source='subdomain.name', read_only=True) webserver = serializers.CharField(source='web_server', read_only=True) # 映射字段名 status_code = serializers.IntegerField(source='status', read_only=True) # 映射字段名 - responseHeaders = serializers.JSONField(source='response_headers', read_only=True) # HTTP响应头 + responseHeaders = serializers.CharField(source='response_headers', read_only=True) # 原始HTTP响应头 class Meta: model = WebsiteSnapshot @@ -232,7 +232,7 @@ class WebsiteSnapshotSerializer(serializers.ModelSerializer): 'content_type', 'status_code', # 使用映射后的字段名 'content_length', - 'body_preview', + 'response_body', 'tech', 'vhost', 'responseHeaders', # HTTP响应头 @@ -270,7 +270,7 @@ class EndpointSnapshotSerializer(serializers.ModelSerializer): source='matched_gf_patterns', read_only=True, ) - responseHeaders = serializers.JSONField(source='response_headers', read_only=True) # HTTP响应头 + responseHeaders = serializers.CharField(source='response_headers', read_only=True) # 原始HTTP响应头 class Meta: model = EndpointSnapshot @@ -284,7 +284,7 @@ class EndpointSnapshotSerializer(serializers.ModelSerializer): 'content_type', 'status_code', 'content_length', - 'body_preview', + 'response_body', 'tech', 'vhost', 'responseHeaders', # HTTP响应头 diff --git a/backend/apps/asset/views.py b/backend/apps/asset/views.py index 9ea238ba..74d89e1c 100644 --- a/backend/apps/asset/views.py +++ b/backend/apps/asset/views.py @@ -367,7 +367,7 @@ class WebSiteViewSet(viewsets.ModelViewSet): def export(self, request, **kwargs): """导出网站为 CSV 格式 - CSV 列:url, host, location, title, status_code, content_length, content_type, webserver, tech, body_preview, vhost, created_at + CSV 列:url, host, location, title, status_code, content_length, content_type, webserver, tech, response_body, response_headers, vhost, created_at """ from apps.common.utils import generate_csv_rows, format_datetime, format_list_field @@ -380,7 +380,7 @@ class WebSiteViewSet(viewsets.ModelViewSet): headers = [ 'url', 'host', 'location', 'title', 'status_code', 'content_length', 'content_type', 'webserver', 'tech', - 'body_preview', 'vhost', 'created_at' + 'response_body', 'response_headers', 'vhost', 'created_at' ] formatters = { 'created_at': format_datetime, @@ -628,7 +628,7 @@ class EndpointViewSet(viewsets.ModelViewSet): def export(self, request, **kwargs): """导出端点为 CSV 格式 - CSV 列:url, host, location, title, status_code, content_length, content_type, webserver, tech, body_preview, vhost, matched_gf_patterns, created_at + CSV 列:url, host, location, title, status_code, content_length, content_type, webserver, tech, response_body, response_headers, vhost, matched_gf_patterns, created_at """ from apps.common.utils import generate_csv_rows, format_datetime, format_list_field @@ -641,7 +641,7 @@ class EndpointViewSet(viewsets.ModelViewSet): headers = [ 'url', 'host', 'location', 'title', 'status_code', 'content_length', 'content_type', 'webserver', 'tech', - 'body_preview', 'vhost', 'matched_gf_patterns', 'created_at' + 'response_body', 'response_headers', 'vhost', 'matched_gf_patterns', 'created_at' ] formatters = { 'created_at': format_datetime, @@ -853,7 +853,7 @@ class WebsiteSnapshotViewSet(viewsets.ModelViewSet): def export(self, request, **kwargs): """导出网站快照为 CSV 格式 - CSV 列:url, host, location, title, status_code, content_length, content_type, webserver, tech, body_preview, vhost, created_at + CSV 列:url, host, location, title, status_code, content_length, content_type, webserver, tech, response_body, response_headers, vhost, created_at """ from apps.common.utils import generate_csv_rows, format_datetime, format_list_field @@ -866,7 +866,7 @@ class WebsiteSnapshotViewSet(viewsets.ModelViewSet): headers = [ 'url', 'host', 'location', 'title', 'status_code', 'content_length', 'content_type', 'webserver', 'tech', - 'body_preview', 'vhost', 'created_at' + 'response_body', 'response_headers', 'vhost', 'created_at' ] formatters = { 'created_at': format_datetime, @@ -970,7 +970,7 @@ class EndpointSnapshotViewSet(viewsets.ModelViewSet): def export(self, request, **kwargs): """导出端点快照为 CSV 格式 - CSV 列:url, host, location, title, status_code, content_length, content_type, webserver, tech, body_preview, vhost, matched_gf_patterns, created_at + CSV 列:url, host, location, title, status_code, content_length, content_type, webserver, tech, response_body, response_headers, vhost, matched_gf_patterns, created_at """ from apps.common.utils import generate_csv_rows, format_datetime, format_list_field @@ -983,7 +983,7 @@ class EndpointSnapshotViewSet(viewsets.ModelViewSet): headers = [ 'url', 'host', 'location', 'title', 'status_code', 'content_length', 'content_type', 'webserver', 'tech', - 'body_preview', 'vhost', 'matched_gf_patterns', 'created_at' + 'response_body', 'response_headers', 'vhost', 'matched_gf_patterns', 'created_at' ] formatters = { 'created_at': format_datetime, diff --git a/backend/apps/scan/configs/command_templates.py b/backend/apps/scan/configs/command_templates.py index 348549a0..d3252214 100644 --- a/backend/apps/scan/configs/command_templates.py +++ b/backend/apps/scan/configs/command_templates.py @@ -97,9 +97,10 @@ SITE_SCAN_COMMANDS = { 'base': ( "'{scan_tools_base}/httpx' -l '{url_file}' " '-status-code -content-type -content-length ' - '-location -title -server -body-preview ' + '-location -title -server ' '-tech-detect -cdn -vhost ' - '-include-response-header ' + '-include-response ' + '-rstr 2000 ' '-random-agent -no-color -json -silent' ), 'optional': { @@ -170,9 +171,10 @@ URL_FETCH_COMMANDS = { 'base': ( "'{scan_tools_base}/httpx' -l '{url_file}' " '-status-code -content-type -content-length ' - '-location -title -server -body-preview ' + '-location -title -server ' '-tech-detect -cdn -vhost ' - '-include-response-header ' + '-include-response ' + '-rstr 2000 ' '-random-agent -no-color -json -silent' ), 'optional': { diff --git a/backend/apps/scan/configs/engine_config_example.yaml b/backend/apps/scan/configs/engine_config_example.yaml index a243c2ed..6eec2ff5 100644 --- a/backend/apps/scan/configs/engine_config_example.yaml +++ b/backend/apps/scan/configs/engine_config_example.yaml @@ -81,7 +81,7 @@ site_scan: fingerprint_detect: # ==================== 指纹识别 ==================== - # 在 site_scan 后串行执行,识别 WebSite 的技术栈 + # 在 站点扫描 后串行执行,识别 WebSite 的技术栈 tools: xingfinger: enabled: true diff --git a/backend/apps/scan/tasks/fingerprint_detect/run_xingfinger_task.py b/backend/apps/scan/tasks/fingerprint_detect/run_xingfinger_task.py index 7bd8aad2..899c2f80 100644 --- a/backend/apps/scan/tasks/fingerprint_detect/run_xingfinger_task.py +++ b/backend/apps/scan/tasks/fingerprint_detect/run_xingfinger_task.py @@ -119,10 +119,10 @@ def bulk_merge_website_fields( insert_sql = f""" INSERT INTO {table_name} ( target_id, url, host, location, title, webserver, - body_preview, content_type, tech, status_code, content_length, + response_body, content_type, tech, status_code, content_length, response_headers, created_at ) - VALUES (%s, %s, %s, '', %s, %s, '', '', %s::varchar[], %s, %s, '{{}}'::jsonb, NOW()) + VALUES (%s, %s, %s, '', %s, %s, '', '', %s::varchar[], %s, %s, '', NOW()) ON CONFLICT (target_id, url) DO UPDATE SET tech = (SELECT ARRAY(SELECT DISTINCT unnest( COALESCE({table_name}.tech, ARRAY[]::varchar[]) || EXCLUDED.tech diff --git a/backend/apps/scan/tasks/site_scan/run_and_stream_save_websites_task.py b/backend/apps/scan/tasks/site_scan/run_and_stream_save_websites_task.py index 935c30e1..0e865e83 100644 --- a/backend/apps/scan/tasks/site_scan/run_and_stream_save_websites_task.py +++ b/backend/apps/scan/tasks/site_scan/run_and_stream_save_websites_task.py @@ -129,12 +129,12 @@ class HttpxRecord: self.content_type = data.get('content_type', '') self.location = data.get('location', '') self.webserver = data.get('webserver', '') - self.body_preview = data.get('body_preview', '') + self.response_body = data.get('body', '') # 从 body 字段获取完整响应体 self.tech = data.get('tech', []) self.vhost = data.get('vhost') self.failed = data.get('failed', False) self.timestamp = data.get('timestamp') - self.response_headers = data.get('header', {}) # 响应头(httpx 输出的 header 字段) + self.response_headers = data.get('raw_header', '') # 从 raw_header 字段获取原始响应头字符串 # 从 URL 中提取主机名 self.host = self._extract_hostname() @@ -355,13 +355,13 @@ def _save_batch( location=record.location, # location 字段保存重定向信息 title=record.title[:1000] if record.title else '', web_server=record.webserver[:200] if record.webserver else '', - body_preview=record.body_preview[:1000] if record.body_preview else '', + response_body=record.response_body if record.response_body else '', content_type=record.content_type[:200] if record.content_type else '', tech=record.tech if isinstance(record.tech, list) else [], status=record.status_code, content_length=record.content_length, vhost=record.vhost, - response_headers=record.response_headers if record.response_headers else {}, + response_headers=record.response_headers if record.response_headers else '', ) snapshot_items.append(snapshot_dto) diff --git a/backend/apps/scan/tasks/url_fetch/run_and_stream_save_urls_task.py b/backend/apps/scan/tasks/url_fetch/run_and_stream_save_urls_task.py index 781b13d6..320bb4da 100644 --- a/backend/apps/scan/tasks/url_fetch/run_and_stream_save_urls_task.py +++ b/backend/apps/scan/tasks/url_fetch/run_and_stream_save_urls_task.py @@ -110,9 +110,9 @@ def _parse_and_validate_line(line: str) -> Optional[dict]: 'webserver': _sanitize_string(line_data.get('webserver', '')), 'location': _sanitize_string(line_data.get('location', '')), 'tech': line_data.get('tech', []), - 'body_preview': _sanitize_string(line_data.get('body_preview', '')), + 'response_body': _sanitize_string(line_data.get('body', '')), 'vhost': line_data.get('vhost', False), - 'response_headers': line_data.get('header', {}), + 'response_headers': _sanitize_string(line_data.get('raw_header', '')), } except Exception: @@ -299,11 +299,11 @@ def _save_batch( webserver=record.get('webserver', ''), content_type=record.get('content_type', ''), tech=record.get('tech', []), - body_preview=record.get('body_preview', ''), + response_body=record.get('response_body', ''), vhost=record.get('vhost', False), matched_gf_patterns=[], target_id=target_id, - response_headers=record.get('response_headers', {}), + response_headers=record.get('response_headers', ''), ) snapshots.append(dto) except Exception as e: diff --git a/backend/scripts/generate_test_data_sql.py b/backend/scripts/generate_test_data_sql.py index 0dcd801d..f3a75a43 100644 --- a/backend/scripts/generate_test_data_sql.py +++ b/backend/scripts/generate_test_data_sql.py @@ -180,6 +180,28 @@ def get_db_config() -> dict: } +def generate_raw_response_headers(headers_dict: dict) -> str: + """ + 将响应头字典转换为原始 HTTP 响应头字符串格式 + + Args: + headers_dict: 响应头字典 + + Returns: + 原始 HTTP 响应头字符串,格式如: + HTTP/1.1 200 OK + Server: nginx + Content-Type: text/html + ... + """ + lines = ['HTTP/1.1 200 OK'] + for key, value in headers_dict.items(): + # 将下划线转换为连字符,并首字母大写 + header_name = key.replace('_', '-').title() + lines.append(f'{header_name}: {value}') + return '\r\n'.join(lines) + + DB_CONFIG = get_db_config() @@ -812,7 +834,7 @@ class TestDataGenerator: ] # 真实的 body preview 内容 - body_previews = [ + response_bodies = [ '