IDS with ML — Source: GitHub

This project aimed to explore temporal network traffic data to extract relevant features that differentiate network intrusions from normal traffic. An autoencoder was first used to perform dimensionality reduction, before a simple ANN was used to classify the traffic as malicious or benign.

You can also read more on the following pages: the STEPS Showcase and the Poster.

Code Snippet 1: Preprocessing

# Pass 1 over the raw capture files: count how many files each column appears
# in (freq) and accumulate the total number of preprocessed rows (row) so the
# average rows-per-file can be derived afterwards.
# NOTE: `row`, `freq`, `file_list`, `normal_dirs`, `num_of_test_files`,
# `num_of_val_files`, `limit_files`, `limit` and `preprocess` are defined
# earlier in the notebook/script.
for parent_dir in file_list:
    start = 0  # JSON files seen in this directory (drives the optional limit)
    count = 0  # files counted in this directory, including skipped test/val files
    for f in os.listdir(parent_dir):
        # Skip non-JSON entries *before* opening them — the original opened
        # every file first and only then checked the extension, wasting a
        # file handle on every non-JSON entry.
        if not f.endswith(".json"):
            continue
        with open(os.path.join(parent_dir, f), 'r') as fp:
            jsonstr = json.load(fp)
        print(str(parent_dir) + "/" + str(f))
        start = start + 1
        # Reserve the first test+val files of each directory for evaluation;
        # they are excluded from the frequency/row statistics.
        if (count < num_of_test_files+num_of_val_files):
            count += 1
            continue
        d1 = preprocess(jsonstr)
        row += d1.shape[0]

        # Append the label column: 1 for attack traffic, 0 for normal traffic.
        label = 0 if parent_dir in normal_dirs else 1
        d1.insert(d1.shape[1], "class", [label] * d1.shape[0], True)
        print("Normal" if label == 0 else "Attack")
        print(parent_dir)

        # Tally how many files each column occurs in.
        for col in d1:
            freq[col] = freq.get(col, 0.0) + 1.0
        count += 1
        if limit_files and start > limit:
            break
# NOTE(review): `count` is reset for every directory while `row` accumulates
# across ALL directories, so this divides the global row total by only the
# *last* directory's file count (which also includes its skipped test/val
# files). Confirm whether a global file counter was intended here.
avg_row = row // count

# Keep only the columns that appear in at least half of the files, and drop
# anything that looks like an IP address or an epoch timestamp — those are
# identifiers/time markers, not useful model features.
approved_columns.extend(
    col
    for col, n_seen in freq.items()
    if not ((n_seen/count) < 0.5) and not ("ip" in col) and not ("epoch" in col)
)
print("Approved columns length: ", len(approved_columns))
np.save("approved_columns.npy", approved_columns)
                        
Code Snippet 2: Model

with session:
    # define model
    # LSTM autoencoder: the encoder (150 -> 100 -> 50 units) compresses each
    # (timesteps x n_features) window into a 50-unit bottleneck vector, and
    # the decoder (50 -> 100 -> 150 units) reconstructs the original window.
    # NOTE(review): the `with session:` block appears to continue beyond this
    # snippet (the checkpoint callback is defined but training is not shown).
    timesteps = stacked_df.shape[1]   # sequence length per sample
    n_features = stacked_df.shape[2]  # features per timestep
    # Convert the train/val/test arrays to tensors.
    # NOTE(review): float64 doubles memory vs. Keras' default float32 — confirm
    # the extra precision is actually needed here.
    val_stacked_df = tf.convert_to_tensor(val_stacked_df, np.float64)
    test_stacked_df = tf.convert_to_tensor(test_stacked_df, np.float64)
    stacked_df = tf.convert_to_tensor(stacked_df, np.float64) #uncomment if you are not normalizing above

    model = Sequential()
    # Padded timesteps are marked with -1 and masked out of the loss/updates.
    model.add(Masking(mask_value=-1., input_shape=(stacked_df.shape[1], stacked_df.shape[2])))
    # --- Encoder ---
    model.add(LSTM(150, return_sequences=True))
    model.add(LeakyReLU(alpha=0.05))
    model.add(Dropout(0.2))
    model.add(LSTM(100, return_sequences=True))
    model.add(LeakyReLU(alpha=0.05))
    model.add(Dropout(0.2))
    # return_sequences=False collapses the sequence into a single 50-dim
    # bottleneck vector — this named layer is the learned representation.
    model.add(LSTM(50, return_sequences=False, name='bottleneck'))
    model.add(LeakyReLU(alpha=0.05))
    print("Encoder added")
    # --- Decoder ---
    # Repeat the bottleneck vector once per timestep so the decoder LSTMs
    # can unroll it back into a full sequence.
    model.add(RepeatVector(timesteps))
    print("Adding Decoder")
    model.add(LSTM(50, return_sequences=True))
    model.add(LeakyReLU(alpha=0.05))
    model.add(LSTM(100, return_sequences=True))
    model.add(LeakyReLU(alpha=0.05))
    model.add(LSTM(150, return_sequences=True))
    model.add(LeakyReLU(alpha=0.05))
    # Project each timestep back to the original feature dimensionality.
    model.add(TimeDistributed(Dense(n_features)))
    adam = keras.optimizers.Adam()
    # MSE reconstruction loss.
    # NOTE(review): "accuracy" is not meaningful for a regression/
    # reconstruction objective — consider dropping it or using MAE instead.
    model.compile(optimizer=adam, loss='mse',  metrics=["accuracy"])
    print(model.summary())
    # Keep only the weights with the best (lowest) validation loss.
    mc1 = ModelCheckpoint('model.h5', monitor='val_loss', mode='min', verbose=1, save_best_only=True)