In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from pandas import Series,DataFrame
from nfstream import NFStreamer

Import pcap data¶

In [2]:
normal_df = NFStreamer(source="/Users/z/Python/atroposinsights-site/ai-site/assets/notebooks/normal.pcap",
                         n_dissections=0,  
                         statistical_analysis=True).to_pandas()
In [3]:
attack_df = NFStreamer(source="/Users/z/Python/atroposinsights-site/ai-site/assets/notebooks/attack.pcap",
                         n_dissections=0,  
                         statistical_analysis=True).to_pandas()
In [4]:
normal_df
Out[4]:
id expiration_id src_ip src_mac src_oui src_port dst_ip dst_mac dst_oui dst_port ... src2dst_rst_packets src2dst_fin_packets dst2src_syn_packets dst2src_cwr_packets dst2src_ece_packets dst2src_urg_packets dst2src_ack_packets dst2src_psh_packets dst2src_rst_packets dst2src_fin_packets
0 0 0 84.3.251.103 fa:00:bc:90:d7:fa fa:00:bc 36925 84.3.251.102 0a:fe:ec:47:74:fb 0a:fe:ec 502 ... 0 1 2 0 0 0 6 1 1 1
1 1 0 84.3.251.105 fe:bb:16:7b:c3:27 fe:bb:16 42083 84.3.251.103 fa:00:bc:90:d7:fa fa:00:bc 502 ... 0 1 2 0 0 0 5 1 0 1
2 2 0 84.3.251.101 e6:3f:ac:c9:a8:8c e6:3f:ac 53511 84.3.251.18 00:80:f4:03:fb:12 00:80:f4 502 ... 0 1 2 0 0 0 6 1 2 1
3 3 0 84.3.251.104 4a:35:83:e0:3d:a4 4a:35:83 35023 84.3.251.102 0a:fe:ec:47:74:fb 0a:fe:ec 502 ... 0 1 2 0 0 0 5 1 0 1
4 4 0 84.3.251.101 e6:3f:ac:c9:a8:8c e6:3f:ac 38055 84.3.251.18 00:80:f4:03:fb:12 00:80:f4 502 ... 0 1 2 0 0 0 6 1 2 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
262 262 0 84.3.251.103 fa:00:bc:90:d7:fa fa:00:bc 34935 84.3.251.102 0a:fe:ec:47:74:fb 0a:fe:ec 502 ... 0 1 2 0 0 0 5 1 0 1
263 263 0 84.3.251.101 e6:3f:ac:c9:a8:8c e6:3f:ac 40745 84.3.251.18 00:80:f4:03:fb:12 00:80:f4 502 ... 0 1 2 0 0 0 6 1 2 1
264 264 0 84.3.251.101 e6:3f:ac:c9:a8:8c e6:3f:ac 60719 84.3.251.18 00:80:f4:03:fb:12 00:80:f4 502 ... 0 1 2 0 0 0 6 1 2 1
265 265 0 84.3.251.101 e6:3f:ac:c9:a8:8c e6:3f:ac 36137 84.3.251.18 00:80:f4:03:fb:12 00:80:f4 502 ... 0 1 2 0 0 0 6 1 2 1
266 266 0 84.3.251.104 4a:35:83:e0:3d:a4 4a:35:83 34199 84.3.251.102 0a:fe:ec:47:74:fb 0a:fe:ec 502 ... 0 1 2 0 0 0 5 1 0 1

267 rows × 77 columns

Label and combine the dataframe¶

In [5]:
normal_df['Label'] = 0
attack_df['Label'] = 1
In [6]:
combined_df = pd.concat([normal_df, attack_df], axis = 0)

Get dummies for categorical features & remove NaN¶

In [7]:
combined_df = pd.get_dummies(combined_df, columns = ['src_ip','src_mac','src_oui','dst_ip','dst_mac','dst_oui'], drop_first=True)
In [8]:
# Check for NaN
combined_df.replace([np.inf, -np.inf], pd.isna, inplace=True)
combined_df = combined_df.dropna(axis=1, how='all')
rows = combined_df.sum().sum()
nans = combined_df.isnull().sum().sum()
print('Total Rows: ', rows)
print('Detected NaN: ', nans, (nans/rows*100), '%')
Total Rows:  5146501725905314.0
Detected NaN:  0 0.0 %

Set X & y¶

In [9]:
X = combined_df.drop(['Label'], axis=1)
y = combined_df['Label']
In [10]:
X.shape  # 530 rows, 101 columns
Out[10]:
(530, 101)

Here we use mutual information from sklearn¶

In [11]:
selector = SelectKBest(mutual_info_classif, k=20)
X_reduced = selector.fit_transform(X,y)
X_reduced.shape  # note the reduced shape from 101 to 20
Out[11]:
(530, 20)

Which features were kept?¶

In [12]:
cols = selector.get_support(indices=True)
selected_cols = X.iloc[:,cols].columns.tolist()
selected_cols
Out[12]:
['protocol',
 'ip_version',
 'bidirectional_first_seen_ms',
 'bidirectional_last_seen_ms',
 'src2dst_first_seen_ms',
 'src2dst_last_seen_ms',
 'src2dst_duration_ms',
 'dst2src_first_seen_ms',
 'dst2src_last_seen_ms',
 'dst2src_duration_ms',
 'bidirectional_stddev_piat_ms',
 'src2dst_mean_piat_ms',
 'src2dst_stddev_piat_ms',
 'src2dst_max_piat_ms',
 'dst2src_min_piat_ms',
 'src2dst_ack_packets',
 'dst2src_rst_packets',
 'src_ip_84.3.251.20',
 'src_mac_74:46:a0:bd:a7:1b',
 'dst_oui_fa:00:bc']