update

2025-03-09 03:19:40 +08:00
parent 1cea516cf7
commit defe59ffe8
7 changed files with 337 additions and 59 deletions
--- a/ai_anti_malware/ml.cpp
+++ b/ai_anti_malware/ml.cpp
@@ -1,4 +1,5 @@
 #include "ml.h"
+#include <Windows.h>
 #include <array>
 #include <limits>
 #include <algorithm>
@@ -7,6 +8,7 @@
 #include <iomanip>
 #include <sstream>
 #include <cfloat>
+#include <filesystem>

 // 确保std命名空间中的函数可用
 using std::max;
@@ -177,15 +179,14 @@ MachineLearning::~MachineLearning() {
    // 析构函数，清理资源（如有必要）
 }

-bool MachineLearning::ExtractFeatures(const uint8_t* buffer, size_t bufferSize,
-                                      const std::string& outputPath) {
+std::vector<double> MachineLearning::ExtractFeatures(const uint8_t* buffer,
+                                                     size_t bufferSize) {
    // 使用libpeconv解析PE文件
    size_t v_size = 0;
    BYTE* peBuffer = peconv::load_pe_module(const_cast<BYTE*>(buffer),
                                            bufferSize, v_size, false, false);
    if (!peBuffer) {
-        std::cerr << "无法加载PE文件" << std::endl;
-        return false;
+        return std::vector<double>();
    }

    // 解析PE信息
@@ -202,7 +203,7 @@ bool MachineLearning::ExtractFeatures(const uint8_t* buffer, size_t bufferSize,
        (PIMAGE_NT_HEADERS)peconv::get_nt_hdrs(peBuffer);
    if (!ntHeaders) {
        peconv::free_pe_buffer(peBuffer);
-        return false;
+        return std::vector<double>();
    }

    // 从NT头部获取信息
@@ -392,13 +393,10 @@ bool MachineLearning::ExtractFeatures(const uint8_t* buffer, size_t bufferSize,
    // 7. 节区数量
    allFeatures.push_back(static_cast<double>(sections.size()));

-    // 导出特征到CSV
-    bool result = ExportToCSV(allFeatures, outputPath);
-
    // 清理资源
    peconv::free_pe_buffer(peBuffer);

-    return result;
+    return allFeatures;
 }

 std::vector<double> MachineLearning::EncodeProperties(
@@ -588,4 +586,124 @@ MachineLearning::GetOpcodeStatistics(const uint8_t* data, size_t dataSize,
                                     bool isX64, const PeInfo& peInfo) {
    // 此函数未使用，但保留实现接口
    return std::make_tuple(std::vector<double>(), std::vector<int>());
+}
+
+// Reads the file at filePath into a byte vector. If it cannot be opened, sized,
+// or fully read, logs to stderr and returns an empty vector.
+std::vector<uint8_t> MachineLearning::ReadFileToBuffer(
+    const std::string& filePath) {
+    // Open positioned at the end so tellg() reports the file size.
+    std::ifstream fileStream(filePath, std::ios::binary | std::ios::ate);
+    if (!fileStream.is_open()) {
+        std::cerr << "无法打开文件: " << filePath << std::endl;
+        return std::vector<uint8_t>();
+    }
+
+    // tellg() returns -1 on failure; never size the buffer from that.
+    const std::streamsize fileSize = fileStream.tellg();
+    std::vector<uint8_t> buffer;
+    if (fileSize >= 0) buffer.resize(static_cast<size_t>(fileSize));
+    if (fileSize < 0 || !fileStream.seekg(0, std::ios::beg) ||
+        !fileStream.read(reinterpret_cast<char*>(buffer.data()), fileSize)) {
+        std::cerr << "读取文件失败: " << filePath << std::endl;
+        return std::vector<uint8_t>();
+    }
+    return buffer;
+}
+
+// Recursively walks directoryPath, extracts a feature vector from every file
+// via ExtractFeatures, and writes one CSV row per successfully processed file
+// to outputCsvPath: the file path followed by each feature with six decimals.
+// No header row is written.
+// The output file is opened (and truncated) exactly once and shared by the
+// whole traversal; reopening it for each subdirectory would truncate rows
+// that were already written while the outer stream is still appending.
+// Returns false when the CSV file cannot be created or directoryPath itself
+// cannot be enumerated. Unreadable subdirectories and files that cannot be
+// read or parsed are reported on stderr and skipped. The success/failure
+// totals for the whole tree are printed once at the end.
+bool MachineLearning::ProcessDirectory(const std::string& directoryPath,
+                                       const std::string& outputCsvPath) {
+    std::ofstream csvFile(outputCsvPath);
+    if (!csvFile.is_open()) {
+        std::cerr << "无法创建CSV文件: " << outputCsvPath << std::endl;
+        return false;
+    }
+    // Numeric formatting flags persist on the stream, so set them once.
+    csvFile << std::fixed << std::setprecision(6);
+
+    int processedCount = 0;
+    int failedCount = 0;
+
+    // Enumerates one directory level and recurses into subdirectories. The
+    // lambda receives itself as `self` so it can recurse without std::function.
+    // Returns false only when `dir` itself could not be opened.
+    auto walk = [&](const auto& self, const std::string& dir) -> bool {
+        WIN32_FIND_DATAA findData;
+        const std::string searchPath = dir + "\\*";
+        HANDLE hFind = FindFirstFileA(searchPath.c_str(), &findData);
+        if (hFind == INVALID_HANDLE_VALUE) {
+            std::cerr << "无法访问目录: " << dir << std::endl;
+            return false;
+        }
+
+        do {
+            // Skip the "." and ".." pseudo-entries.
+            if (strcmp(findData.cFileName, ".") == 0 ||
+                strcmp(findData.cFileName, "..") == 0) {
+                continue;
+            }
+
+            const std::string currentPath = dir + "\\" + findData.cFileName;
+
+            if (findData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) {
+                // Best effort: an unreadable subdirectory is logged by the
+                // nested call and does not abort the rest of the scan.
+                self(self, currentPath);
+                continue;
+            }
+
+            // Not a directory: load the whole file into memory before parsing.
+            const auto fileBuffer = ReadFileToBuffer(currentPath);
+            if (fileBuffer.empty()) {
+                std::cerr << "跳过文件: " << currentPath << " (读取失败)"
+                          << std::endl;
+                failedCount++;
+                continue;
+            }
+
+            const std::vector<double> features =
+                ExtractFeatures(fileBuffer.data(), fileBuffer.size());
+            if (features.empty()) {
+                std::cerr << "跳过文件: " << currentPath << " (特征提取失败)"
+                          << std::endl;
+                failedCount++;
+                continue;
+            }
+
+            // One row per file: the path, then every feature value.
+            csvFile << currentPath;
+            for (const double feature : features) {
+                csvFile << "," << feature;
+            }
+            csvFile << std::endl;
+
+            processedCount++;
+            if (processedCount % 100 == 0) {
+                std::cout << "已处理 " << processedCount << " 个文件..."
+                          << std::endl;
+            }
+        } while (FindNextFileA(hFind, &findData));
+
+        FindClose(hFind);
+        return true;
+    };
+
+    const bool rootOk = walk(walk, directoryPath);
+    csvFile.close();
+    if (!rootOk) {
+        return false;
+    }
+    printf("ML Process Result, success count: %d fail count: %d \n",
+           processedCount, failedCount);
+    return true;
+}