1

2025-03-09 14:57:42 +08:00
parent 10c56952c6
commit 51f929abfa
5 changed files with 527 additions and 50 deletions
--- a/ai_anti_malware/ml.cpp
+++ b/ai_anti_malware/ml.cpp
@@ -340,6 +340,7 @@ std::vector<double> MachineLearning::ExtractFeatures(const uint8_t* buffer,
        peInfo.characteristics = ntHeaders64->FileHeader.Characteristics;
        peInfo.dllCharacteristics =
            ntHeaders64->OptionalHeader.DllCharacteristics;
+        peInfo.hasImageBase = ntHeaders64->OptionalHeader.ImageBase != 0;
    } else {
        // 32位PE文件
        PIMAGE_NT_HEADERS32 ntHeaders32 = (PIMAGE_NT_HEADERS32)ntHeaders;
@@ -352,6 +353,7 @@ std::vector<double> MachineLearning::ExtractFeatures(const uint8_t* buffer,
        peInfo.characteristics = ntHeaders32->FileHeader.Characteristics;
        peInfo.dllCharacteristics =
            ntHeaders32->OptionalHeader.DllCharacteristics;
+        peInfo.hasImageBase = ntHeaders32->OptionalHeader.ImageBase != 0;
    }

    // 检查PE目录
@@ -398,8 +400,6 @@ std::vector<double> MachineLearning::ExtractFeatures(const uint8_t* buffer,
                                          IMAGE_DIRECTORY_ENTRY_DELAY_IMPORT);
    peInfo.hasDelayImports = dataDir && dataDir->VirtualAddress != 0;

-    peInfo.hasImageBase = true;  // PE文件都有ImageBase
-
    dataDir = peconv::get_directory_entry(peBuffer, IMAGE_DIRECTORY_ENTRY_IAT);
    peInfo.hasEntryIat = dataDir && dataDir->VirtualAddress != 0;

@@ -544,9 +544,12 @@ std::vector<double> MachineLearning::EncodeEntrypoint(
    const std::vector<uint8_t>& epBytes) {
    std::vector<double> features;

+    // 只使用前64个字节，确保特征数量固定
+    size_t bytesToUse = std::min<size_t>(64, epBytes.size());
+
    // 原始字节转为浮点值（按Python代码中的normalize处理）
-    for (const auto& byte : epBytes) {
-        features.push_back(static_cast<double>(byte) / 255.0);
+    for (size_t i = 0; i < bytesToUse; i++) {
+        features.push_back(static_cast<double>(epBytes[i]) / 255.0);
    }

    // 填充至64字节长度
@@ -743,34 +746,49 @@ std::vector<uint8_t> MachineLearning::ReadFileToBuffer(

 bool MachineLearning::ProcessDirectory(const std::string& directoryPath,
                                       const std::string& outputCsvPath) {
-    // 打开CSV文件用于写入
-    std::ofstream csvFile(outputCsvPath);
+    // 检查文件是否已存在
+    bool fileExists = std::filesystem::exists(outputCsvPath);
+
+    // 打开CSV文件用于写入，如果文件已存在则使用追加模式
+    std::ofstream csvFile;
+    if (fileExists) {
+        csvFile.open(outputCsvPath, std::ios::app);
+    } else {
+        csvFile.open(outputCsvPath);
+    }
+
    if (!csvFile.is_open()) {
-        std::cerr << "无法创建CSV文件: " << outputCsvPath << std::endl;
+        std::cerr << "无法创建或打开CSV文件: " << outputCsvPath << std::endl;
        return false;
    }
+
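+    // NOTE: the CSV header-writing code below is commented out, so no header row is emitted; if re-enabled it runs only when the file did not already exist.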
    /*
-    // 写入CSV标题行
-    csvFile << "文件路径";
-    for (size_t i = 0; i < _properties.size(); i++) {
-        csvFile << ",属性_" << i;
+   if (!fileExists) {
+
+       // 写入CSV标题行
+       csvFile << "文件路径";
+       for (size_t i = 0; i < _properties.size(); i++) {
+           csvFile << ",属性_" << i;
+       }
+       for (size_t i = 0; i < _libraries.size(); i++) {
+           csvFile << ",库_" << i;
+       }
+       csvFile << ",文件熵";
+       for (size_t i = 0; i < 64; i++) {  // 前64个字节特征
+           csvFile << ",EP_" << i;
+       }
+       csvFile << ",节区数";
+       csvFile << ",平均熵";
+       csvFile << ",最大熵";
+       csvFile << ",归一化平均熵";
+       csvFile << ",节区大小比率";
+       csvFile << ",代码比率";
+       csvFile << ",节区计数";
+       csvFile << std::endl;
+
    }
-    for (size_t i = 0; i < _libraries.size(); i++) {
-        csvFile << ",库_" << i;
-    }
-    csvFile << ",文件熵";
-    for (size_t i = 0; i < 64; i++) {  // 前64个字节特征
-        csvFile << ",EP_" << i;
-    }
-    csvFile << ",节区数";
-    csvFile << ",平均熵";
-    csvFile << ",最大熵";
-    csvFile << ",归一化平均熵";
-    csvFile << ",节区大小比率";
-    csvFile << ",代码比率";
-    csvFile << ",节区计数";
-    csvFile << std::endl;
-    */
+ */
    // 递归遍历目录
    WIN32_FIND_DATAA findData;
    std::string searchPath = directoryPath + "\\*";