本小节演示如何通过神经网络检测Java溢出攻击,实现步骤如下
1、数据集
本小节基于ADFA-LD数据集,处理源码如下
def load_one_flle(filename):
    """Return the first line of *filename* with surrounding newlines removed.

    Only the first line is read; any further lines in the file are ignored.
    """
    with open(filename) as f:
        line = f.readline()
    # Strip the newline character -- not the letter "n".
    return line.strip('\n')


def load_adfa_training_files(rootdir):
    """Load every regular file directly inside *rootdir* as a normal sample.

    Sub-directories are skipped; there is no recursion.

    Returns:
        A tuple ``(x, y)``: ``x`` holds the first line of each file and
        ``y`` holds one ``0`` label per loaded file.
    """
    x = []
    y = []
    for name in os.listdir(rootdir):
        path = os.path.join(rootdir, name)
        if os.path.isfile(path):
            x.append(load_one_flle(path))
            print("Load file(%s)" % path)
            y.append(0)
    return x, y


def dirlist(path, allfile):
    """Recursively collect the paths of all non-directory entries below *path*.

    Paths are appended to *allfile* in place and the same list is returned.
    """
    for filename in os.listdir(path):
        filepath = os.path.join(path, filename)
        if os.path.isdir(filepath):
            dirlist(filepath, allfile)
        else:
            allfile.append(filepath)
    return allfile


def load_adfa_java_files(rootdir):
    """Load the Java Meterpreter attack traces found below *rootdir*.

    A file is kept only when its path starts with the literal prefix
    ``../data/ADFA-LD/Attack_Data_Master/Java_Meterpreter_``, so *rootdir*
    is expected to be that relative attack-data directory.

    Returns:
        A tuple ``(x, y)``: ``x`` holds the first line of each matching file
        and ``y`` holds one ``1`` label per loaded file.
    """
    # The dots are escaped so that ".." is matched literally instead of
    # as "any two characters".
    pattern = r"\.\./data/ADFA-LD/Attack_Data_Master/Java_Meterpreter_"
    x = []
    y = []
    for file in dirlist(rootdir, []):
        if re.match(pattern, file):
            print("Load file(%s)" % file)
            x.append(load_one_flle(file))
            y.append(1)
    return x, y


if __name__ == '__main__':
    # Normal traces are labelled 0, attack traces 1; concatenate both sets.
    x1, y1 = load_adfa_training_files("../data/ADFA-LD/Training_Data_Master/")
    x2, y2 = load_adfa_java_files("../data/ADFA-LD/Attack_Data_Master/")
    x = x1 + x2
    y = y1 + y2
2、特征化
本小节使用词袋模型(CountVectorizer按词频计数)进行特征化,代码如下所示
# Convert the loaded trace strings into a dense matrix of per-token counts.
# NOTE(review): CountVectorizer's default token_pattern only keeps tokens of
# two or more characters, so single-digit system-call numbers are ignored --
# confirm this is intended.
vectorizer = CountVectorizer(min_df=1)
x = vectorizer.fit_transform(x).toarray()
3、完整代码
# -*- coding:utf-8 -*-
import os
import re

import numpy as np
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier


def load_one_flle(filename):
    """Return the first line of *filename* with surrounding newlines removed.

    Only the first line is read; any further lines in the file are ignored.
    """
    with open(filename) as f:
        line = f.readline()
    # Strip the newline character -- not the letter "n".
    return line.strip('\n')


def load_adfa_training_files(rootdir):
    """Load every regular file directly inside *rootdir* as a normal sample.

    Sub-directories are skipped; there is no recursion.

    Returns:
        A tuple ``(x, y)``: ``x`` holds the first line of each file and
        ``y`` holds one ``0`` label per loaded file.
    """
    x = []
    y = []
    for name in os.listdir(rootdir):
        path = os.path.join(rootdir, name)
        if os.path.isfile(path):
            x.append(load_one_flle(path))
            print("Load file(%s)" % path)
            y.append(0)
    return x, y


def dirlist(path, allfile):
    """Recursively collect the paths of all non-directory entries below *path*.

    Paths are appended to *allfile* in place and the same list is returned.
    """
    for filename in os.listdir(path):
        filepath = os.path.join(path, filename)
        if os.path.isdir(filepath):
            dirlist(filepath, allfile)
        else:
            allfile.append(filepath)
    return allfile


def load_adfa_java_files(rootdir):
    """Load the Java Meterpreter attack traces found below *rootdir*.

    A file is kept only when its path starts with the literal prefix
    ``../data/ADFA-LD/Attack_Data_Master/Java_Meterpreter_``, so *rootdir*
    is expected to be that relative attack-data directory.

    Returns:
        A tuple ``(x, y)``: ``x`` holds the first line of each matching file
        and ``y`` holds one ``1`` label per loaded file.
    """
    # The dots are escaped so that ".." is matched literally instead of
    # as "any two characters".
    pattern = r"\.\./data/ADFA-LD/Attack_Data_Master/Java_Meterpreter_"
    x = []
    y = []
    for file in dirlist(rootdir, []):
        if re.match(pattern, file):
            print("Load file(%s)" % file)
            x.append(load_one_flle(file))
            y.append(1)
    return x, y


if __name__ == '__main__':
    # Normal traces are labelled 0, attack traces 1; concatenate both sets.
    x1, y1 = load_adfa_training_files("../data/ADFA-LD/Training_Data_Master/")
    x2, y2 = load_adfa_java_files("../data/ADFA-LD/Attack_Data_Master/")
    x = x1 + x2
    y = y1 + y2

    # Convert the trace strings into a dense matrix of per-token counts.
    # NOTE(review): CountVectorizer's default token_pattern only keeps tokens
    # of two or more characters, so single-digit system-call numbers are
    # ignored -- confirm this is intended.
    vectorizer = CountVectorizer(min_df=1)
    x = vectorizer.fit_transform(x)
    x = x.toarray()

    # MLP with hidden layers of 150 and 50 units, trained with SGD for at
    # most 10 iterations; random_state is fixed for repeatable runs.
    mlp = MLPClassifier(hidden_layer_sizes=(150, 50), max_iter=10, alpha=1e-4,
                        solver='sgd', verbose=10, tol=1e-4, random_state=1,
                        learning_rate_init=.1)

    # 10-fold cross-validation; print the per-fold scores and their mean.
    score = model_selection.cross_val_score(mlp, x, y, n_jobs=1, cv=10)
    print(score)
    print(np.mean(score))
4、运行结果
这里使用十折交叉验证,测试结果平均准确率约为87%,效果一般。数据集一共才957个样本,其中正常样本833个、黑样本124个;即使把所有样本都判为正常,准确率也有833/957≈87%,所以这个准确率确实不理想
[0.86597938 0.86597938 0.87628866 0.86458333 0.87368421 0.87368421 0.87368421 0.87368421 0.87368421 0.87368421]
0.871493601917164