IDS with ML — Source: GitHub
This project aimed to explore temporal network traffic data and extract features that
differentiate network intrusions from normal traffic. An autoencoder was first used to perform
dimensionality reduction, and a simple ANN then classified the reduced representations as
malicious or benign.
You can read more at the following pages as well: STEPS Showcase and Poster
Code Snippet 1: Preprocessing
# Pass 1 over the raw capture directories: measure the average row count per
# preprocessed file and tally how often each column appears, so that a stable
# set of columns ("approved") can be selected for training.
#
# Free variables expected from the surrounding script:
#   file_list           - iterable of directories containing per-capture .json files
#   normal_dirs         - directories whose traffic is labelled benign (class 0)
#   num_of_test_files, num_of_val_files - files per dir reserved for test/val splits
#   row, freq, approved_columns, limit_files, limit - accumulators / config
#   preprocess(jsonstr) - project helper returning a pandas DataFrame
for parent_dir in file_list:
    start = 0   # files seen in this directory (drives the optional `limit` cutoff)
    count = 0   # files actually processed (after skipping the test/val reserve)
    for f in os.listdir(parent_dir):
        # Filter on extension BEFORE opening: avoids opening (and leaking effort
        # on) non-JSON files that would be skipped anyway.
        if not f.endswith(".json"):
            continue
        # Close the handle as soon as the JSON is parsed; processing below does
        # not need the file open.
        with open(os.path.join(parent_dir, f), 'r') as fp:
            jsonstr = json.load(fp)
        print(f"{parent_dir}/{f}")
        start += 1
        # The first num_of_test_files + num_of_val_files files of each directory
        # are reserved for the test/validation splits and excluded from stats.
        if count < num_of_test_files + num_of_val_files:
            count += 1
            continue
        d1 = preprocess(jsonstr)
        row += d1.shape[0]
        # Label column: 1 for attack, 0 for normal data.
        if parent_dir in normal_dirs:
            d1.insert(d1.shape[1], "class", [0] * d1.shape[0], True)
            print("Normal")
            print(parent_dir)
        else:
            d1.insert(d1.shape[1], "class", [1] * d1.shape[0], True)
            print("Attack")
            print(parent_dir)
        # Count in how many files each column occurs.
        for col in d1:
            freq[col] = freq.get(col, 0.0) + 1.0
        count += 1
        if limit_files and start > limit:
            break
# NOTE(review): `count` is reset for every parent_dir, so avg_row and the
# frequency threshold below reflect only the LAST directory processed. If
# file_list has more than one directory this looks like a bug — confirm intent.
avg_row = row // count
# Keep columns that appear in at least half the files, excluding IP addresses
# and epoch timestamps (identifiers/time, not generalizable features).
for col in freq:
    if freq[col] / count >= 0.5 and "ip" not in col and "epoch" not in col:
        approved_columns.append(col)
print("Approved columns length: ", len(approved_columns))
np.save("approved_columns.npy", approved_columns)
Code Snippet 2: Model
with session:
    # Build a symmetric LSTM autoencoder over the stacked (samples, timesteps,
    # features) tensor: 150 -> 100 -> 50 encoder, a 50-unit bottleneck, then a
    # mirrored 50 -> 100 -> 150 decoder reconstructing all n_features per step.
    timesteps = stacked_df.shape[1]
    n_features = stacked_df.shape[2]
    # NOTE(review): float64 doubles memory vs the float32 Keras default and is
    # rarely needed for LSTM training — confirm the precision is intentional.
    val_stacked_df = tf.convert_to_tensor(val_stacked_df, np.float64)
    test_stacked_df = tf.convert_to_tensor(test_stacked_df, np.float64)
    stacked_df = tf.convert_to_tensor(stacked_df, np.float64)  # uncomment if you are not normalizing above
    model = Sequential()
    # Masking skips padded timesteps (pad value -1) in all downstream LSTMs.
    model.add(Masking(mask_value=-1., input_shape=(stacked_df.shape[1], stacked_df.shape[2])))
    model.add(LSTM(150, return_sequences=True))
    model.add(LeakyReLU(alpha=0.05))
    model.add(Dropout(0.2))
    model.add(LSTM(100, return_sequences=True))
    model.add(LeakyReLU(alpha=0.05))
    model.add(Dropout(0.2))
    # Bottleneck: return_sequences=False collapses the sequence to one 50-dim
    # vector — the compressed representation used later for classification.
    model.add(LSTM(50, return_sequences=False, name='bottleneck'))
    model.add(LeakyReLU(alpha=0.05))
    print("Encoder added")
    # Repeat the bottleneck vector once per timestep to seed the decoder.
    model.add(RepeatVector(timesteps))
    print("Adding Decoder")
    model.add(LSTM(50, return_sequences=True))
    model.add(LeakyReLU(alpha=0.05))
    model.add(LSTM(100, return_sequences=True))
    model.add(LeakyReLU(alpha=0.05))
    model.add(LSTM(150, return_sequences=True))
    model.add(LeakyReLU(alpha=0.05))
    # One Dense(n_features) applied independently at every timestep.
    model.add(TimeDistributed(Dense(n_features)))
    adam = keras.optimizers.Adam()
    # Fix: dropped metrics=["accuracy"] — for a continuous MSE reconstruction
    # target, Keras "accuracy" is exact float equality and is meaningless noise.
    model.compile(optimizer=adam, loss='mse')
    print(model.summary())
    # Keep only the weights with the lowest validation loss seen so far.
    mc1 = ModelCheckpoint('model.h5', monitor='val_loss', mode='min', verbose=1, save_best_only=True)