import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from pandas import Series,DataFrame
from nfstream import NFStreamer
# Flow-export options shared by both captures so the two frames end up with
# the same set of columns.
# NOTE(review): per the NFStream docs, n_dissections=0 should disable
# application-layer dissection and statistical_analysis=True should add the
# per-flow statistical columns -- confirm against the installed version.
_streamer_options = {"n_dissections": 0, "statistical_analysis": True}

# Flows extracted from the capture named normal.pcap, as a DataFrame.
normal_df = NFStreamer(
    source="/Users/z/Python/atroposinsights-site/ai-site/assets/notebooks/normal.pcap",
    **_streamer_options,
).to_pandas()

# Flows extracted from the capture named attack.pcap, using identical options.
attack_df = NFStreamer(
    source="/Users/z/Python/atroposinsights-site/ai-site/assets/notebooks/attack.pcap",
    **_streamer_options,
).to_pandas()

# Notebook display of the normal-traffic flows.
normal_df
id | expiration_id | src_ip | src_mac | src_oui | src_port | dst_ip | dst_mac | dst_oui | dst_port | ... | src2dst_rst_packets | src2dst_fin_packets | dst2src_syn_packets | dst2src_cwr_packets | dst2src_ece_packets | dst2src_urg_packets | dst2src_ack_packets | dst2src_psh_packets | dst2src_rst_packets | dst2src_fin_packets | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 84.3.251.103 | fa:00:bc:90:d7:fa | fa:00:bc | 36925 | 84.3.251.102 | 0a:fe:ec:47:74:fb | 0a:fe:ec | 502 | ... | 0 | 1 | 2 | 0 | 0 | 0 | 6 | 1 | 1 | 1 |
1 | 1 | 0 | 84.3.251.105 | fe:bb:16:7b:c3:27 | fe:bb:16 | 42083 | 84.3.251.103 | fa:00:bc:90:d7:fa | fa:00:bc | 502 | ... | 0 | 1 | 2 | 0 | 0 | 0 | 5 | 1 | 0 | 1 |
2 | 2 | 0 | 84.3.251.101 | e6:3f:ac:c9:a8:8c | e6:3f:ac | 53511 | 84.3.251.18 | 00:80:f4:03:fb:12 | 00:80:f4 | 502 | ... | 0 | 1 | 2 | 0 | 0 | 0 | 6 | 1 | 2 | 1 |
3 | 3 | 0 | 84.3.251.104 | 4a:35:83:e0:3d:a4 | 4a:35:83 | 35023 | 84.3.251.102 | 0a:fe:ec:47:74:fb | 0a:fe:ec | 502 | ... | 0 | 1 | 2 | 0 | 0 | 0 | 5 | 1 | 0 | 1 |
4 | 4 | 0 | 84.3.251.101 | e6:3f:ac:c9:a8:8c | e6:3f:ac | 38055 | 84.3.251.18 | 00:80:f4:03:fb:12 | 00:80:f4 | 502 | ... | 0 | 1 | 2 | 0 | 0 | 0 | 6 | 1 | 2 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
262 | 262 | 0 | 84.3.251.103 | fa:00:bc:90:d7:fa | fa:00:bc | 34935 | 84.3.251.102 | 0a:fe:ec:47:74:fb | 0a:fe:ec | 502 | ... | 0 | 1 | 2 | 0 | 0 | 0 | 5 | 1 | 0 | 1 |
263 | 263 | 0 | 84.3.251.101 | e6:3f:ac:c9:a8:8c | e6:3f:ac | 40745 | 84.3.251.18 | 00:80:f4:03:fb:12 | 00:80:f4 | 502 | ... | 0 | 1 | 2 | 0 | 0 | 0 | 6 | 1 | 2 | 1 |
264 | 264 | 0 | 84.3.251.101 | e6:3f:ac:c9:a8:8c | e6:3f:ac | 60719 | 84.3.251.18 | 00:80:f4:03:fb:12 | 00:80:f4 | 502 | ... | 0 | 1 | 2 | 0 | 0 | 0 | 6 | 1 | 2 | 1 |
265 | 265 | 0 | 84.3.251.101 | e6:3f:ac:c9:a8:8c | e6:3f:ac | 36137 | 84.3.251.18 | 00:80:f4:03:fb:12 | 00:80:f4 | 502 | ... | 0 | 1 | 2 | 0 | 0 | 0 | 6 | 1 | 2 | 1 |
266 | 266 | 0 | 84.3.251.104 | 4a:35:83:e0:3d:a4 | 4a:35:83 | 34199 | 84.3.251.102 | 0a:fe:ec:47:74:fb | 0a:fe:ec | 502 | ... | 0 | 1 | 2 | 0 | 0 | 0 | 5 | 1 | 0 | 1 |
267 rows × 77 columns
# Attach the class label in place: 0 for flows from the normal capture,
# 1 for flows from the attack capture.
for frame, label in ((normal_df, 0), (attack_df, 1)):
    frame['Label'] = label

# Address-like columns that are one-hot encoded below.
categorical_cols = ['src_ip', 'src_mac', 'src_oui', 'dst_ip', 'dst_mac', 'dst_oui']

# Stack the two labelled frames row-wise (original row indices are kept, so
# index values repeat across the two sources).
stacked_df = pd.concat([normal_df, attack_df], axis=0)

# Replace each categorical column with indicator columns; drop_first=True
# omits the first level of every column.
combined_df = pd.get_dummies(
    stacked_df,
    columns=categorical_cols,
    drop_first=True,
)
# Check for NaN
# Turn +/-inf into NaN so infinite values are counted as missing below.
# The replacement value must be the NaN marker itself (np.nan), not the
# pd.isna function object.
combined_df.replace([np.inf, -np.inf], np.nan, inplace=True)
# Drop only the columns in which every value is missing; columns with some
# missing values are kept and show up in the NaN count.
combined_df = combined_df.dropna(axis=1, how='all')
# Number of flows (rows) in the combined frame.
rows = combined_df.shape[0]
# Total number of cells (rows x columns): the denominator for the NaN share.
cells = combined_df.size
# Total number of missing cells across the whole frame.
nans = combined_df.isnull().sum().sum()
# Guard against an empty frame so the percentage never divides by zero.
nan_pct = (nans / cells * 100) if cells else 0.0
print('Total Rows: ', rows)
print('Detected NaN: ', nans, nan_pct, '%')
Total Rows: 5146501725905314.0 Detected NaN: 0 0.0 %
# Target vector: the class label column added before the frames were combined.
y = combined_df['Label']
# Feature matrix: every remaining column of the combined frame.
X = combined_df.drop(columns=['Label'])
X.shape # 530 rows, 101 columns
(530, 101)
# Keep the 20 features that score highest on mutual information with the label.
# NOTE(review): mutual_info_classif is called without a random_state here, so
# scores (and possibly the selected set) may vary between runs -- confirm
# whether a fixed seed is wanted.
selector = SelectKBest(score_func=mutual_info_classif, k=20)
# Score the features against y, then project X onto the selected columns.
selector.fit(X, y)
X_reduced = selector.transform(X)
X_reduced.shape # note the reduced shape from 101 to 20
(530, 20)
# Integer positions (within X's columns) of the features the selector kept.
cols = selector.get_support(indices=True)
# Translate those positions back into column names.
selected_cols = X.columns[cols].tolist()
selected_cols
['protocol', 'ip_version', 'bidirectional_first_seen_ms', 'bidirectional_last_seen_ms', 'src2dst_first_seen_ms', 'src2dst_last_seen_ms', 'src2dst_duration_ms', 'dst2src_first_seen_ms', 'dst2src_last_seen_ms', 'dst2src_duration_ms', 'bidirectional_stddev_piat_ms', 'src2dst_mean_piat_ms', 'src2dst_stddev_piat_ms', 'src2dst_max_piat_ms', 'dst2src_min_piat_ms', 'src2dst_ack_packets', 'dst2src_rst_packets', 'src_ip_84.3.251.20', 'src_mac_74:46:a0:bd:a7:1b', 'dst_oui_fa:00:bc']