update
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
#include "ml.h"
|
||||
#include <Windows.h>
|
||||
#include <array>
|
||||
#include <limits>
|
||||
#include <algorithm>
|
||||
@@ -7,6 +8,7 @@
|
||||
#include <iomanip>
|
||||
#include <sstream>
|
||||
#include <cfloat>
|
||||
#include <filesystem>
|
||||
|
||||
// 确保std命名空间中的函数可用
|
||||
using std::max;
|
||||
@@ -177,15 +179,14 @@ MachineLearning::~MachineLearning() {
|
||||
// 析构函数,清理资源(如有必要)
|
||||
}
|
||||
|
||||
bool MachineLearning::ExtractFeatures(const uint8_t* buffer, size_t bufferSize,
|
||||
const std::string& outputPath) {
|
||||
std::vector<double> MachineLearning::ExtractFeatures(const uint8_t* buffer,
|
||||
size_t bufferSize) {
|
||||
// 使用libpeconv解析PE文件
|
||||
size_t v_size = 0;
|
||||
BYTE* peBuffer = peconv::load_pe_module(const_cast<BYTE*>(buffer),
|
||||
bufferSize, v_size, false, false);
|
||||
if (!peBuffer) {
|
||||
std::cerr << "无法加载PE文件" << std::endl;
|
||||
return false;
|
||||
return std::vector<double>();
|
||||
}
|
||||
|
||||
// 解析PE信息
|
||||
@@ -202,7 +203,7 @@ bool MachineLearning::ExtractFeatures(const uint8_t* buffer, size_t bufferSize,
|
||||
(PIMAGE_NT_HEADERS)peconv::get_nt_hdrs(peBuffer);
|
||||
if (!ntHeaders) {
|
||||
peconv::free_pe_buffer(peBuffer);
|
||||
return false;
|
||||
return std::vector<double>();
|
||||
}
|
||||
|
||||
// 从NT头部获取信息
|
||||
@@ -392,13 +393,10 @@ bool MachineLearning::ExtractFeatures(const uint8_t* buffer, size_t bufferSize,
|
||||
// 7. 节区数量
|
||||
allFeatures.push_back(static_cast<double>(sections.size()));
|
||||
|
||||
// 导出特征到CSV
|
||||
bool result = ExportToCSV(allFeatures, outputPath);
|
||||
|
||||
// 清理资源
|
||||
peconv::free_pe_buffer(peBuffer);
|
||||
|
||||
return result;
|
||||
return allFeatures;
|
||||
}
|
||||
|
||||
std::vector<double> MachineLearning::EncodeProperties(
|
||||
@@ -588,4 +586,124 @@ MachineLearning::GetOpcodeStatistics(const uint8_t* data, size_t dataSize,
|
||||
bool isX64, const PeInfo& peInfo) {
|
||||
// 此函数未使用,但保留实现接口
|
||||
return std::make_tuple(std::vector<double>(), std::vector<int>());
|
||||
}
|
||||
|
||||
// Reads the whole file at `filePath` into memory as raw bytes.
//
// @param filePath  Path of the file to read (opened in binary mode).
// @return The file contents. An empty vector is returned when the file cannot
//         be opened, when its size cannot be determined, or when the read
//         fails. A zero-length file also yields an empty vector, so callers
//         that treat "empty" as failure will skip such files.
std::vector<uint8_t> MachineLearning::ReadFileToBuffer(
    const std::string& filePath) {
    // Open positioned at the end so that tellg() reports the file size.
    std::ifstream fileStream(filePath, std::ios::binary | std::ios::ate);
    if (!fileStream.is_open()) {
        std::cerr << "无法打开文件: " << filePath << std::endl;
        return std::vector<uint8_t>();
    }

    // tellg() yields -1 on failure; using that value as a vector size would
    // request an enormous allocation, so reject it explicitly.
    const std::streamsize fileSize = fileStream.tellg();
    if (fileSize < 0) {
        std::cerr << "读取文件失败: " << filePath << std::endl;
        return std::vector<uint8_t>();
    }
    if (fileSize == 0) {
        // Nothing to read; same result the read path would produce.
        return std::vector<uint8_t>();
    }

    fileStream.seekg(0, std::ios::beg);

    // Allocate the buffer and read the file in a single call.
    std::vector<uint8_t> buffer(static_cast<size_t>(fileSize));
    if (!fileStream.read(reinterpret_cast<char*>(buffer.data()), fileSize)) {
        std::cerr << "读取文件失败: " << filePath << std::endl;
        return std::vector<uint8_t>();
    }

    return buffer;
}
|
||||
|
||||
bool MachineLearning::ProcessDirectory(const std::string& directoryPath,
|
||||
const std::string& outputCsvPath) {
|
||||
// 打开CSV文件用于写入
|
||||
std::ofstream csvFile(outputCsvPath);
|
||||
if (!csvFile.is_open()) {
|
||||
std::cerr << "无法创建CSV文件: " << outputCsvPath << std::endl;
|
||||
return false;
|
||||
}
|
||||
/*
|
||||
// 写入CSV标题行
|
||||
csvFile << "文件路径";
|
||||
for (size_t i = 0; i < _properties.size(); i++) {
|
||||
csvFile << ",属性_" << i;
|
||||
}
|
||||
for (size_t i = 0; i < _libraries.size(); i++) {
|
||||
csvFile << ",库_" << i;
|
||||
}
|
||||
csvFile << ",文件熵";
|
||||
for (size_t i = 0; i < 64; i++) { // 前64个字节特征
|
||||
csvFile << ",EP_" << i;
|
||||
}
|
||||
csvFile << ",节区数";
|
||||
csvFile << ",平均熵";
|
||||
csvFile << ",最大熵";
|
||||
csvFile << ",归一化平均熵";
|
||||
csvFile << ",节区大小比率";
|
||||
csvFile << ",代码比率";
|
||||
csvFile << ",节区计数";
|
||||
csvFile << std::endl;
|
||||
*/
|
||||
// 递归遍历目录
|
||||
WIN32_FIND_DATAA findData;
|
||||
std::string searchPath = directoryPath + "\\*";
|
||||
HANDLE hFind = FindFirstFileA(searchPath.c_str(), &findData);
|
||||
|
||||
if (hFind == INVALID_HANDLE_VALUE) {
|
||||
std::cerr << "无法访问目录: " << directoryPath << std::endl;
|
||||
csvFile.close();
|
||||
return false;
|
||||
}
|
||||
|
||||
int processedCount = 0;
|
||||
int failedCount = 0;
|
||||
|
||||
do {
|
||||
// 跳过 "." 和 ".." 目录
|
||||
if (strcmp(findData.cFileName, ".") == 0 ||
|
||||
strcmp(findData.cFileName, "..") == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
std::string currentPath = directoryPath + "\\" + findData.cFileName;
|
||||
|
||||
if (findData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) {
|
||||
// 递归处理子目录
|
||||
ProcessDirectory(currentPath, outputCsvPath);
|
||||
} else {
|
||||
// 处理文件
|
||||
std::vector<uint8_t> fileBuffer = ReadFileToBuffer(currentPath);
|
||||
if (fileBuffer.empty()) {
|
||||
std::cerr << "跳过文件: " << currentPath << " (读取失败)"
|
||||
<< std::endl;
|
||||
failedCount++;
|
||||
continue;
|
||||
}
|
||||
|
||||
// 提取特征
|
||||
std::vector<double> features =
|
||||
ExtractFeatures(fileBuffer.data(), fileBuffer.size());
|
||||
if (features.empty()) {
|
||||
std::cerr << "跳过文件: " << currentPath << " (特征提取失败)"
|
||||
<< std::endl;
|
||||
failedCount++;
|
||||
continue;
|
||||
}
|
||||
|
||||
// 写入CSV
|
||||
csvFile << currentPath;
|
||||
for (const auto& feature : features) {
|
||||
csvFile << "," << std::fixed << std::setprecision(6) << feature;
|
||||
}
|
||||
csvFile << std::endl;
|
||||
|
||||
processedCount++;
|
||||
if (processedCount % 100 == 0) {
|
||||
std::cout << "已处理 " << processedCount << " 个文件..."
|
||||
<< std::endl;
|
||||
}
|
||||
}
|
||||
} while (FindNextFileA(hFind, &findData));
|
||||
|
||||
FindClose(hFind);
|
||||
csvFile.close();
|
||||
printf("ML Process Result, success count: %d fail count: %d \n",
|
||||
processedCount, failedCount);
|
||||
return true;
|
||||
}
|
||||
Reference in New Issue
Block a user