# coding=utf-8
import fileinput
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import tensorflow as tf
import sys
import re
import nltk
import sklearn
import tensorflow.keras as keras
import tensorflow.keras.preprocessing as keras_preprocessing
from sklearn.preprocessing import StandardScaler
import chardet
import math
from joblib import load

g_word_dict = {}

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # hide all GPUs so inference runs on the CPU


def os_listdir_ex(file_dir, find_name):  # inherited legacy helper
    # collect every file under file_dir whose extension equals find_name
    result = []
    for root, dirs, files in os.walk(file_dir):
        for file in files:
            if os.path.splitext(file)[1] == find_name:
                result.append(os.path.join(root, file))
    # return result  # for testing
    return result


def get_file_length(pFile):  # get the file size in bytes; legacy code
    fsize = os.path.getsize(pFile)
    return int(fsize)


def flush_file(pFile):  # strip PHP comments from a source file
    with open(pFile, 'r', encoding='gb18030', errors='ignore') as file:
        read_string = file.read()
    # remove /* ... */ block comments
    m = re.compile(r'/\*.*?\*/', re.S)
    result = re.sub(m, '', read_string)
    # remove // line comments
    m = re.compile(r'//.*')
    result = re.sub(m, '', result)
    # remove # line comments
    m = re.compile(r'#.*')
    result = re.sub(m, '', result)
    return result


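# Note on flush_file (added comment, not in the original): the patterns are purely
# textual, so r'//.*' also strips everything after '//' inside string literals such
# as URLs ('http://...'), and r'#.*' removes PHP '#' comments the same way. This
# coarse cleaning is presumably acceptable for feature extraction; it is not a PHP parser.
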
# Compute the character-level entropy of a file
# (reference: https://blog.csdn.net/jliang3/article/details/88359063)
def get_file_entropy(pFile):
    clean_string = flush_file(pFile)
    text_list = {}
    _sum = 0
    result = 0
    # count character frequencies, ignoring newlines and spaces
    for word_iter in clean_string:
        if word_iter != '\n' and word_iter != ' ':
            if word_iter not in text_list.keys():
                text_list[word_iter] = 1
            else:
                text_list[word_iter] = text_list[word_iter] + 1
    for index in text_list.keys():
        _sum = _sum + text_list[index]
    for index in text_list.keys():
        result = result - float(text_list[index]) / _sum * \
            math.log(float(text_list[index]) / _sum, 2)
    return result


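# get_file_entropy above implements the Shannon entropy H = -sum(p_i * log2(p_i)) over
# character frequencies. A rough sanity check (illustrative only; 'demo.php' is a
# hypothetical throwaway file, not part of the project):
#   with open('demo.php', 'w') as f:
#       f.write('abababab')
#   print(get_file_entropy('demo.php'))  # two equally likely symbols -> ~1.0
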
def vectorize_sequences(sequences, dimension=1337):
    # create an all-zeros matrix of shape (len(sequences), dimension)
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        if i > dimension:
            break
        try:
            # multi-hot encode: set column j to 1 for every token id j in the sequence
            results[i, sequence] = 1.
        except IndexError:
            # a token id falls outside the feature dimension
            break
    return results


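# Illustrative example (not from the original project): vectorize_sequences([[3, 7, 7]], 10)
# returns a 1 x 10 matrix with ones in columns 3 and 7, i.e. a multi-hot encoding of
# which token ids occur in each sequence; duplicate ids are not counted twice.
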
def get_file_word_bag(pFile):
    global g_word_dict
    english_punctuations = [',', '.', ':', ';', '?',
                            '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%', 'php', '<', '>', '\'']
    clean_string = flush_file(pFile)
    word_list = nltk.word_tokenize(clean_string)
    # filter out punctuation tokens (and the bare 'php' tag)
    word_list = [
        word_iter for word_iter in word_list if word_iter not in english_punctuations]

    keras_token = keras.preprocessing.text.Tokenizer()  # initialize the tokenizer
    keras_token.fit_on_texts(word_list)  # learn the vocabulary of this text
    g_word_dict.update(keras_token.word_index)
    # texts_to_sequences uses that vocabulary to map every word to an integer id
    sequences_data = keras_token.texts_to_sequences(word_list)
    # Pad each text to a fixed length; PS: a "word" longer than 1337 characters almost
    # certainly means some attacker trying to turn a big webshell into an AV-evading one.
    # word_bag = keras_preprocessing.sequence.pad_sequences(sequences_data, maxlen=1337, dtype='int16')
    word_bag = []
    # flatten the per-token sequences into one flat list of token ids
    for index in range(0, len(sequences_data)):
        if len(sequences_data[index]) != 0:
            for zeus in range(0, len(sequences_data[index])):
                word_bag.append(sequences_data[index][zeus])
    return word_bag


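# Inference: extract the three features (entropy, file length, word bag) from the target
# PHP file, scale the two scalar features with the previously fitted scalers loaded from
# disk, multi-hot encode the word bag via vectorize_sequences, and feed both inputs to
# the trained Keras model.
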
file_path = '.\\1.php'

entropy = get_file_entropy(file_path)
length = get_file_length(file_path)
word_bag = get_file_word_bag(file_path)
array_input = np.array([[entropy, length]])

data_frame = pd.DataFrame(
    {'length': [length], 'entropy': [entropy], 'word_bag': [word_bag]}, columns=['length', 'entropy', 'word_bag'])
# scaler = StandardScaler()
scaler_entropy = load('scaler_entropy.joblib')
scaler_length = load('scaler_length.joblib')
data_frame['length_scaled'] = scaler_length.transform(
    data_frame['length'].values.reshape(-1, 1))
data_frame['entropy_scaled'] = scaler_entropy.transform(
    data_frame['entropy'].values.reshape(-1, 1))

data_train_pre = data_frame.filter(items=['length_scaled', 'entropy_scaled'])
# data_train_pre = data_frame.filter(items=['length', 'entropy'])
data_train_x_1 = tf.constant(data_train_pre)
data_train_x_2 = tf.constant(
    vectorize_sequences(data_frame['word_bag'].values))
print(data_frame.head())

model_name = 'huoji1.h5'  # huoji.h5 huoji_scaled.h5 huoji_no_scale.h5
model = keras.models.load_model(model_name)
model.summary()
print(data_train_x_1, data_train_x_2)
prediction = model.predict([data_train_x_1, data_train_x_2])
print(prediction)
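# The printed prediction is whatever the trained model emits for this single sample;
# interpreting it (e.g. as a webshell-vs-benign score) depends on how huoji1.h5 was
# trained, which is outside the scope of this script.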