-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path05_4_machine_learning_implementation_final.py
199 lines (144 loc) · 8.89 KB
/
05_4_machine_learning_implementation_final.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
## The "all_data.csv" file is required for this program to run.
## The "all_data.csv" file must be located in the same directory as the program.
## The purpose of this program is to apply machine learning algorithms to the dataset and observe their performance.
## The algorithms used are: Naive Bayes, QDA, Random Forest, ID3, AdaBoost, MLP, Nearest Neighbors.
## For each run the program outputs: file name, machine learning algorithm name, accuracy, precision, recall, F1-score and time.
## The program creates a CSV file containing the results and a folder containing the graphics.
## Some parts of the code used for calculation and graphing are taken from the following site:
## http://scikit-learn.org
from sklearn import metrics
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import average_precision_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
import matplotlib.pyplot as plt
from sklearn import preprocessing
import numpy as np
#%matplotlib inline
import os
import pandas as pd
import csv
import time
import warnings
import math
warnings.filterwarnings("ignore")
def folder(f_name):
    """Create the directory ``f_name`` (including missing parents) if needed.

    An already existing directory is left untouched. Any ``OSError`` raised
    while creating the directory (for example missing permissions, or the
    path already existing as a regular file) is reported on stdout instead
    of being propagated, so the script keeps running.

    Args:
        f_name: Path of the directory to create.
    """
    try:
        # exist_ok=True avoids the check-then-create race of testing
        # os.path.exists() before calling os.makedirs().
        os.makedirs(f_name, exist_ok=True)
    except OSError:
        print ("The folder could not be created!")


result="./results/results_Final.csv"  # CSV file in which the results are saved.
csv_files=["all_data.csv"]  # names of the dataset files to process.
path=""  # prefix prepended to every name in csv_files when it is read.
repetition=10  # number of train/test repetitions per algorithm.

# Create the output folders: one for the results CSV, one for the box plots.
# folder_name keeps the graph folder afterwards, where the plots are saved.
folder_name="./results/"
folder(folder_name)
folder_name="./results/result_graph_Final/"
folder(folder_name)
# Columns imported from the dataset: the 20 features selected by the file
# "04_2_feature_selection_for_attack_files.py", followed by the Label column.
usecols = [
    "Bwd Packet Length Std",
    "Flow Bytes/s",
    "Total Length of Fwd Packets",
    "Fwd Packet Length Std",
    "Flow IAT Std",
    "Flow IAT Min",
    "Fwd IAT Total",
    "Flow Duration",
    "Bwd Packet Length Max",
    "Flow IAT Max",
    "Flow IAT Mean",
    "Total Length of Bwd Packets",
    "Fwd Packet Length Min",
    "Bwd Packet Length Mean",
    "Flow Packets/s",
    "Fwd Packet Length Mean",
    "Total Backward Packets",
    "Total Fwd Packets",
    "Fwd Packet Length Max",
    "Bwd Packet Length Min",
    "Label",
]
# Machine learning algorithms to evaluate, keyed by the display name that is
# printed, written to the results CSV and used in the graph file names.
# The insertion order is the order in which the algorithms are run.
ml_list = {
    "Naive Bayes": GaussianNB(),
    "QDA": QDA(),
    "MLP": MLPClassifier(max_iter=500, hidden_layer_sizes=(3, 3, 3)),
    "Random Forest": RandomForestClassifier(
        n_estimators=10,
        max_depth=5,
        max_features=1,
    ),
    "ID3": DecisionTreeClassifier(criterion="entropy", max_depth=5),
    "AdaBoost": AdaBoostClassifier(),
    "Nearest Neighbors": KNeighborsClassifier(n_neighbors=3),
}
# Feature set shared by Random Forest, ID3, AdaBoost and Nearest Neighbors:
# the first 7 of the features created by the file
# "04_2_feature_selection_for_attack_files.py".
others = [
    "Bwd Packet Length Std",
    "Flow Bytes/s",
    "Total Length of Fwd Packets",
    "Fwd Packet Length Std",
    "Flow IAT Std",
    "Flow IAT Min",
    "Fwd IAT Total",
]

# Feature list used by each machine learning method.
# "Naive Bayes", "QDA" and "MLP" each have their own feature list; the
# remaining algorithms all refer to the very same "others" list object.
algorithms_features = {
    "Naive Bayes": [
        "Bwd Packet Length Std",
        "Total Length of Fwd Packets",
        "Flow IAT Min",
        "Fwd Packet Length Min",
        "Flow Packets/s",
        "Fwd Packet Length Mean",
    ],
    "QDA": [
        "Bwd Packet Length Std",
        "Flow Bytes/s",
        "Total Length of Fwd Packets",
        "Flow IAT Min",
    ],
    "MLP": [
        "Bwd Packet Length Std",
        "Flow Bytes/s",
        "Total Length of Fwd Packets",
        "Fwd Packet Length Std",
        "Flow IAT Min",
        "Bwd Packet Length Max",
        "Fwd Packet Length Min",
        "Bwd Packet Length Mean",
        "Total Backward Packets",
        "Total Fwd Packets",
        "Fwd Packet Length Max",
        "Bwd Packet Length Min",
    ],
    # These algorithms use the features in the "others" list.
    **dict.fromkeys(
        ("Random Forest", "ID3", "AdaBoost", "Nearest Neighbors"),
        others,
    ),
}
seconds = time.time()  # time stamp for the total processing time

# Column titles shared by the console table and the results CSV.
header = ["File", "ML algorithm", "accuracy", "Precision", "Recall", "F1-score", "Time"]
row_format = '%-17s %-17s %-15s %-15s %-15s %-15s %-15s'

# (Re)create the results CSV containing only the header row; the rows of each
# algorithm are appended to it further below.
with open(result, "w", newline="", encoding="utf-8") as f:
    csv.writer(f).writerow(header)

for csv_name in csv_files:  # operations are repeated for every dataset file
    print(row_format % tuple(header))  # print output header
    dataset_name = os.path.splitext(csv_name)[0]  # file name without extension

    # usecols is passed as-is and never modified, so every file in csv_files
    # is read with the complete column list (including "Label").
    df = pd.read_csv(path + csv_name, usecols=usecols)
    df = df.fillna(0)

    # Separate the label from the data (Label=y, Data=X) and encode it for
    # the classifiers: "BENIGN" becomes 1, every other label becomes 0.
    y = (df.pop("Label") == "BENIGN").astype(int)

    for ml_name, clf in ml_list.items():  # operations are repeated for every algorithm
        X = df[algorithms_features[ml_name]]
        precision = []
        recall = []
        f1 = []
        accuracy = []
        t_time = []

        for i in range(repetition):  # repeated hold-out evaluation
            second = time.time()  # time stamp for the processing time of this run

            # Data (X) and labels (y) are split 80% / 20% into train and test
            # parts. The repetition index is the seed, so every repetition
            # gets a different but reproducible split; a constant seed would
            # repeat the same split each time.
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.20, random_state=i)

            # The machine learning algorithm is applied in this section.
            clf.fit(X_train, y_train)
            predict = clf.predict(X_test)

            # Macro-averaged precision, recall and F-measure of this run.
            precision.append(float(precision_score(y_test, predict, average='macro')))
            recall.append(float(recall_score(y_test, predict, average='macro')))
            f1.append(float(f1_score(y_test, predict, average='macro')))
            # Accuracy is computed from the predictions already made, so the
            # test set is not predicted a second time inside the timed section.
            accuracy.append(float(metrics.accuracy_score(y_test, predict)))
            t_time.append(float(time.time() - second))

        # The average result of the repetitions is printed on the screen.
        print(row_format % (
            dataset_name,
            ml_name,
            str(round(np.mean(accuracy), 2)),
            str(round(np.mean(precision), 2)),
            str(round(np.mean(recall), 2)),
            str(round(np.mean(f1), 2)),
            str(round(np.mean(t_time), 4)),
        ))

        # Every single repetition is saved as one row of the results CSV.
        with open(result, "a", newline="", encoding="utf-8") as f:
            wrt = csv.writer(f)
            wrt.writerows(
                [dataset_name, ml_name, *scores]
                for scores in zip(accuracy, precision, recall, f1, t_time)
            )

        # A box plot of the F-measures of this algorithm is saved in the
        # folder named by folder_name. Each plot gets its own figure, which is
        # closed afterwards, so boxes of earlier algorithms cannot leak into
        # later plots.
        fig = plt.figure()
        plt.boxplot(f1)
        plt.title("All Dataset - " + str(ml_name))
        plt.ylabel('F-measure')
        # NOTE: the former papertype='a4' argument is dropped on purpose; it
        # is not accepted for PDF output by current Matplotlib releases.
        plt.savefig(folder_name + dataset_name + str(ml_name) + ".pdf",
                    bbox_inches='tight', orientation='portrait', format='pdf')
        plt.show()  # comment this line out if the graphics should not be displayed
        plt.close(fig)

print("mission accomplished!")
print("Total operation time: = ",time.time()- seconds ,"seconds")