From 4d1ccb16aa3179c180117b1a25cdd634109a1fd6 Mon Sep 17 00:00:00 2001
From: Huoji's <1296564236@qq.com>
Date: Sun, 9 Mar 2025 03:29:14 +0800
Subject: [PATCH] Implement Rich Header parsing for PE file analysis

- Added ParseRichHeader method to extract Rich header information from PE files
- Defined RichEntry and RichHeaderInfo structures to store Rich header details
- Implemented decoding of Rich header entries with checksum XOR technique
- Updated ml.h and ml.cpp to support Rich header parsing
- Improved error handling and logging in ProcessDirectory method
- Translated some log messages to English for consistency
---
 ai_anti_malware/ml.cpp | 74 +++++++++++++++++++++++++++++++++++++++---
 ai_anti_malware/ml.h   | 11 ++++++-
 2 files changed, 79 insertions(+), 6 deletions(-)
diff --git a/ai_anti_malware/ml.cpp b/ai_anti_malware/ml.cpp
index 861af55..16c8320 100644
--- a/ai_anti_malware/ml.cpp
+++ b/ai_anti_malware/ml.cpp
@@ -179,6 +179,70 @@ MachineLearning::~MachineLearning() {
     // 析构函数，清理资源（如有必要）
 }
 
+// Parses the "Rich" header stored between the DOS header and the PE header.
+// DWORD layout: "DanS"^key, three 0^key pads, N x {comp.id^key, count^key},
+// plain "Rich", plain key. Entries are returned in file order.
+// NOTE(review): no buffer size is passed in, so e_lfanew is trusted as-is.
+bool MachineLearning::ParseRichHeader(const uint8_t* peBuffer,
+                                      RichHeaderInfo& richInfo) {
+    constexpr uint32_t kDanSTag = 0x536E6144;  // "DanS"
+    constexpr uint32_t kRichTag = 0x68636952;  // "Rich"
+    constexpr size_t kPrefixDwords = 4;  // "DanS" + three padding DWORDs
+    constexpr size_t kScanStart = sizeof(IMAGE_DOS_HEADER);
+
+    richInfo.entries.clear();
+    const auto* dosHeader =
+        reinterpret_cast<const IMAGE_DOS_HEADER*>(peBuffer);
+    if (!dosHeader || dosHeader->e_magic != IMAGE_DOS_SIGNATURE) {
+        return false;
+    }
+    // e_lfanew is signed: reject offsets inside the DOS header so the
+    // subtraction below cannot wrap around to a huge scan range.
+    const long long peOffset = dosHeader->e_lfanew;
+    if (peOffset < static_cast<long long>(kScanStart)) {
+        return false;
+    }
+    const uint32_t* scanPtr =
+        reinterpret_cast<const uint32_t*>(peBuffer + kScanStart);
+    const size_t maxItems =
+        (static_cast<size_t>(peOffset) - kScanStart) / sizeof(uint32_t);
+
+    // "Rich" is stored in the clear and is directly followed by the key.
+    size_t richIndex = 0;
+    while (richIndex + 1 < maxItems && scanPtr[richIndex] != kRichTag) {
+        ++richIndex;
+    }
+    if (richIndex + 1 >= maxItems) {
+        return false;  // no "Rich" marker with room for its key
+    }
+    const uint32_t key = scanPtr[richIndex + 1];
+
+    // "DanS" is stored XORed with the key; search backwards from "Rich".
+    size_t dansIndex = richIndex;  // richIndex doubles as "not found"
+    for (size_t i = richIndex; i-- > 0;) {
+        if ((scanPtr[i] ^ key) == kDanSTag) {
+            dansIndex = i;
+            break;
+        }
+    }
+    if (richIndex - dansIndex < kPrefixDwords) {
+        return false;  // no "DanS", or padding DWORDs are missing
+    }
+
+    const size_t firstEntry = dansIndex + kPrefixDwords;
+    const size_t entryCount = (richIndex - firstEntry) / 2;
+    richInfo.checksum = key;
+    richInfo.entries.reserve(entryCount);
+    for (size_t i = 0; i < entryCount; ++i) {
+        const uint32_t compId = scanPtr[firstEntry + 2 * i] ^ key;
+        RichEntry entry;
+        entry.productId = static_cast<uint16_t>(compId >> 16);  // high word
+        entry.buildId = static_cast<uint16_t>(compId & 0xFFFF);  // low word
+        entry.useCount = scanPtr[firstEntry + 2 * i + 1] ^ key;
+        richInfo.entries.push_back(entry);
+    }
+    return true;
+}
 std::vector<double> MachineLearning::ExtractFeatures(const uint8_t* buffer,
                                                      size_t bufferSize) {
     // 使用libpeconv解析PE文件
@@ -672,7 +736,7 @@ bool MachineLearning::ProcessDirectory(const std::string& directoryPath,
             // 处理文件
             std::vector<uint8_t> fileBuffer = ReadFileToBuffer(currentPath);
             if (fileBuffer.empty()) {
-                std::cerr << "跳过文件: " << currentPath << " (读取失败)"
+                std::cerr << "skip file: " << currentPath << " (read failed)"
                           << std::endl;
                 failedCount++;
                 continue;
@@ -682,8 +746,8 @@ bool MachineLearning::ProcessDirectory(const std::string& directoryPath,
             std::vector<double> features =
                 ExtractFeatures(fileBuffer.data(), fileBuffer.size());
             if (features.empty()) {
-                std::cerr << "跳过文件: " << currentPath << " (特征提取失败)"
-                          << std::endl;
+                std::cerr << "skip file: " << currentPath
+                          << " (can't get feature)" << std::endl;
                 failedCount++;
                 continue;
             }
@@ -697,8 +761,8 @@ bool MachineLearning::ProcessDirectory(const std::string& directoryPath,
 
             processedCount++;
             if (processedCount % 100 == 0) {
-                std::cout << "已处理 " << processedCount << " 个文件..."
-                          << std::endl;
+                std::cout << "a ready processed " << processedCount
+                          << " files..." << std::endl;
             }
         }
     } while (FindNextFileA(hFind, &findData));
diff --git a/ai_anti_malware/ml.h b/ai_anti_malware/ml.h
index cd6646d..631bce6 100644
--- a/ai_anti_malware/ml.h
+++ b/ai_anti_malware/ml.h
@@ -15,7 +15,16 @@
 struct PeInfo;
 struct SectionInfo;
 class BasicPeInfo;
+struct RichEntry {  // one decoded Rich header entry
+    uint16_t productId = 0;  // component ID
+    uint16_t buildId = 0;    // version number
+    uint32_t useCount = 0;   // use count
+};
 
+struct RichHeaderInfo {  // result of parsing a PE Rich header
+    uint32_t checksum = 0;           // checksum (XOR key of the entries)
+    std::vector<RichEntry> entries;  // decoded Rich header entries
+};
 // RVA转换为内存中的指针的辅助函数
 inline BYTE* RvaToPtr(DWORD rva, BYTE* peBuffer) {
     if (!peBuffer || rva == 0) return nullptr;
@@ -61,7 +70,7 @@ class MachineLearning {
    public:
     MachineLearning();
     ~MachineLearning();
-
+    bool ParseRichHeader(const uint8_t* peBuffer, RichHeaderInfo& richInfo);
     // 提取特征并返回特征向量
     std::vector<double> ExtractFeatures(const uint8_t* buffer,
                                         size_t bufferSize);