In [14]:
import pandas as pd
import numpy as np

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

from pandas import Series,DataFrame
from nfstream import NFStreamer

Import pcap data¶

In [15]:
normal_df = NFStreamer(source="/Users/z/Python/atroposinsights-site/ai-site/assets/notebooks/normal.pcap",
                         n_dissections=0,  
                         statistical_analysis=True).to_pandas()
In [16]:
attack_df = NFStreamer(source="/Users/z/Python/atroposinsights-site/ai-site/assets/notebooks/attack.pcap",
                         n_dissections=0,  
                         statistical_analysis=True).to_pandas()
In [17]:
normal_df
Out[17]:
id expiration_id src_ip src_mac src_oui src_port dst_ip dst_mac dst_oui dst_port ... src2dst_rst_packets src2dst_fin_packets dst2src_syn_packets dst2src_cwr_packets dst2src_ece_packets dst2src_urg_packets dst2src_ack_packets dst2src_psh_packets dst2src_rst_packets dst2src_fin_packets
0 0 0 84.3.251.103 fa:00:bc:90:d7:fa fa:00:bc 36925 84.3.251.102 0a:fe:ec:47:74:fb 0a:fe:ec 502 ... 0 1 2 0 0 0 6 1 1 1
1 1 0 84.3.251.101 e6:3f:ac:c9:a8:8c e6:3f:ac 53511 84.3.251.18 00:80:f4:03:fb:12 00:80:f4 502 ... 0 1 2 0 0 0 6 1 2 1
2 2 0 84.3.251.101 e6:3f:ac:c9:a8:8c e6:3f:ac 38055 84.3.251.18 00:80:f4:03:fb:12 00:80:f4 502 ... 0 1 2 0 0 0 6 1 2 1
3 3 0 84.3.251.101 e6:3f:ac:c9:a8:8c e6:3f:ac 34677 84.3.251.18 00:80:f4:03:fb:12 00:80:f4 502 ... 0 1 2 0 0 0 6 1 2 1
4 4 0 84.3.251.101 e6:3f:ac:c9:a8:8c e6:3f:ac 55827 84.3.251.18 00:80:f4:03:fb:12 00:80:f4 502 ... 0 1 2 0 0 0 6 1 2 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
262 262 0 84.3.251.103 fa:00:bc:90:d7:fa fa:00:bc 34935 84.3.251.102 0a:fe:ec:47:74:fb 0a:fe:ec 502 ... 0 1 2 0 0 0 5 1 0 1
263 263 0 84.3.251.101 e6:3f:ac:c9:a8:8c e6:3f:ac 40745 84.3.251.18 00:80:f4:03:fb:12 00:80:f4 502 ... 0 1 2 0 0 0 6 1 2 1
264 264 0 84.3.251.101 e6:3f:ac:c9:a8:8c e6:3f:ac 60719 84.3.251.18 00:80:f4:03:fb:12 00:80:f4 502 ... 0 1 2 0 0 0 6 1 2 1
265 265 0 84.3.251.101 e6:3f:ac:c9:a8:8c e6:3f:ac 36137 84.3.251.18 00:80:f4:03:fb:12 00:80:f4 502 ... 0 1 2 0 0 0 6 1 2 1
266 266 0 84.3.251.104 4a:35:83:e0:3d:a4 4a:35:83 34199 84.3.251.102 0a:fe:ec:47:74:fb 0a:fe:ec 502 ... 0 1 2 0 0 0 5 1 0 1

267 rows × 77 columns

Label and combine the dataframe¶

In [18]:
normal_df['Label'] = 0
attack_df['Label'] = 1
In [19]:
combined_df = pd.concat([normal_df, attack_df], axis = 0)

Get dummies for categorical features & remove NaN¶

In [20]:
combined_df = pd.get_dummies(combined_df, columns = ['src_ip','src_mac','src_oui','dst_ip','dst_mac','dst_oui'], drop_first=True)
In [21]:
# Check for NaN
combined_df.replace([np.inf, -np.inf], pd.isna, inplace=True)
combined_df = combined_df.dropna(axis=1, how='all')
rows = combined_df.sum().sum()
nans = combined_df.isnull().sum().sum()
print('Total Rows: ', rows)
print('Detected NaN: ', nans, (nans/rows*100), '%')
Total Rows:  5146501725905314.0
Detected NaN:  0 0.0 %

Set X & y¶

In [22]:
X = combined_df.drop(['Label'], axis=1)
y = combined_df['Label']
In [23]:
X.shape  # 530 rows, 101 columns
Out[23]:
(530, 101)

Here we use RFE from sklearn¶

In [25]:
#Build a logistic regression model 
model = LogisticRegression()
rfe = RFE(model, n_features_to_select=20, step=1)  # Use RFE to select the top 20 features
fit = rfe.fit(X, y)

Which features were kept?¶

In [39]:
pd.set_option('display.max_rows', None)
#Create a dataframe for the results 
df_RFE_results = []

for i in range(X.shape[1]):
    df_RFE_results.append(
        {      
            'Feature_names': X.columns[i],
            'Selected':  rfe.support_[i],
            'RFE_ranking':  rfe.ranking_[i],
        }
    )

df_RFE_results = pd.DataFrame(df_RFE_results)

df_RFE_results.sort_values(by=['RFE_ranking'])
Out[39]:
Feature_names Selected RFE_ranking
13 src2dst_first_seen_ms True 1
18 dst2src_first_seen_ms True 1
17 src2dst_bytes True 1
44 dst2src_mean_piat_ms True 1
15 src2dst_duration_ms True 1
14 src2dst_last_seen_ms True 1
37 bidirectional_stddev_piat_ms True 1
12 bidirectional_bytes True 1
42 src2dst_max_piat_ms True 1
19 dst2src_last_seen_ms True 1
10 bidirectional_duration_ms True 1
8 bidirectional_first_seen_ms True 1
45 dst2src_stddev_piat_ms True 1
46 dst2src_max_piat_ms True 1
22 dst2src_bytes True 1
41 src2dst_stddev_piat_ms True 1
2 src_port True 1
38 bidirectional_max_piat_ms True 1
9 bidirectional_last_seen_ms True 1
20 dst2src_duration_ms True 1
40 src2dst_mean_piat_ms False 2
36 bidirectional_mean_piat_ms False 3
0 id False 4
3 dst_port False 5
52 bidirectional_psh_packets False 6
51 bidirectional_ack_packets False 7
11 bidirectional_packets False 8
60 src2dst_psh_packets False 9
68 dst2src_psh_packets False 10
67 dst2src_ack_packets False 11
59 src2dst_ack_packets False 12
21 dst2src_packets False 13
16 src2dst_packets False 14
30 src2dst_max_ps False 15
26 bidirectional_max_ps False 16
28 src2dst_mean_ps False 17
34 dst2src_max_ps False 18
24 bidirectional_mean_ps False 19
27 src2dst_min_ps False 20
32 dst2src_mean_ps False 21
31 dst2src_min_ps False 22
23 bidirectional_min_ps False 23
43 dst2src_min_piat_ms False 24
29 src2dst_stddev_ps False 25
25 bidirectional_stddev_ps False 26
4 protocol False 27
47 bidirectional_syn_packets False 28
5 ip_version False 29
53 bidirectional_rst_packets False 30
69 dst2src_rst_packets False 31
55 src2dst_syn_packets False 32
63 dst2src_syn_packets False 33
39 src2dst_min_piat_ms False 34
33 dst2src_stddev_ps False 35
80 src_mac_e6:3f:ac:c9:a8:8c False 36
86 src_oui_e6:3f:ac False 37
91 dst_ip_84.3.251.18 False 38
54 bidirectional_fin_packets False 39
62 src2dst_fin_packets False 40
70 dst2src_fin_packets False 41
35 bidirectional_min_piat_ms False 42
89 dst_ip_84.3.251.102 False 43
93 dst_mac_0a:fe:ec:47:74:fb False 44
97 dst_oui_0a:fe:ec False 45
72 src_ip_84.3.251.103 False 46
87 src_oui_fa:00:bc False 47
81 src_mac_fa:00:bc:90:d7:fa False 48
90 dst_ip_84.3.251.103 False 49
96 dst_mac_fa:00:bc:90:d7:fa False 50
100 dst_oui_fa:00:bc False 51
74 src_ip_84.3.251.105 False 52
88 src_oui_fe:bb:16 False 53
82 src_mac_fe:bb:16:7b:c3:27 False 54
78 src_mac_4a:35:83:e0:3d:a4 False 55
73 src_ip_84.3.251.104 False 56
84 src_oui_4a:35:83 False 57
95 dst_mac_e6:3f:ac:c9:a8:8c False 58
99 dst_oui_e6:3f:ac False 59
77 src_mac_0a:fe:ec:47:74:fb False 60
83 src_oui_0a:fe:ec False 61
71 src_ip_84.3.251.102 False 62
85 src_oui_74:46:a0 False 63
79 src_mac_74:46:a0:bd:a7:1b False 64
76 src_ip_84.3.251.20 False 65
92 dst_ip_84.3.251.20 False 66
94 dst_mac_74:46:a0:bd:a7:1b False 67
75 src_ip_84.3.251.18 False 68
98 dst_oui_74:46:a0 False 69
66 dst2src_urg_packets False 70
7 tunnel_id False 71
6 vlan_id False 72
65 dst2src_ece_packets False 73
64 dst2src_cwr_packets False 74
58 src2dst_urg_packets False 75
57 src2dst_ece_packets False 76
56 src2dst_cwr_packets False 77
1 expiration_id False 78
48 bidirectional_cwr_packets False 79
61 src2dst_rst_packets False 80
49 bidirectional_ece_packets False 81
50 bidirectional_urg_packets False 82
In [ ]: