import pandas as pd
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from pandas import Series,DataFrame
from nfstream import NFStreamer
normal_df = NFStreamer(source="/Users/z/Python/atroposinsights-site/ai-site/assets/notebooks/normal.pcap",
n_dissections=0,
statistical_analysis=True).to_pandas()
attack_df = NFStreamer(source="/Users/z/Python/atroposinsights-site/ai-site/assets/notebooks/attack.pcap",
n_dissections=0,
statistical_analysis=True).to_pandas()
normal_df
id | expiration_id | src_ip | src_mac | src_oui | src_port | dst_ip | dst_mac | dst_oui | dst_port | ... | src2dst_rst_packets | src2dst_fin_packets | dst2src_syn_packets | dst2src_cwr_packets | dst2src_ece_packets | dst2src_urg_packets | dst2src_ack_packets | dst2src_psh_packets | dst2src_rst_packets | dst2src_fin_packets | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 84.3.251.103 | fa:00:bc:90:d7:fa | fa:00:bc | 36925 | 84.3.251.102 | 0a:fe:ec:47:74:fb | 0a:fe:ec | 502 | ... | 0 | 1 | 2 | 0 | 0 | 0 | 6 | 1 | 1 | 1 |
1 | 1 | 0 | 84.3.251.101 | e6:3f:ac:c9:a8:8c | e6:3f:ac | 53511 | 84.3.251.18 | 00:80:f4:03:fb:12 | 00:80:f4 | 502 | ... | 0 | 1 | 2 | 0 | 0 | 0 | 6 | 1 | 2 | 1 |
2 | 2 | 0 | 84.3.251.101 | e6:3f:ac:c9:a8:8c | e6:3f:ac | 38055 | 84.3.251.18 | 00:80:f4:03:fb:12 | 00:80:f4 | 502 | ... | 0 | 1 | 2 | 0 | 0 | 0 | 6 | 1 | 2 | 1 |
3 | 3 | 0 | 84.3.251.101 | e6:3f:ac:c9:a8:8c | e6:3f:ac | 34677 | 84.3.251.18 | 00:80:f4:03:fb:12 | 00:80:f4 | 502 | ... | 0 | 1 | 2 | 0 | 0 | 0 | 6 | 1 | 2 | 1 |
4 | 4 | 0 | 84.3.251.101 | e6:3f:ac:c9:a8:8c | e6:3f:ac | 55827 | 84.3.251.18 | 00:80:f4:03:fb:12 | 00:80:f4 | 502 | ... | 0 | 1 | 2 | 0 | 0 | 0 | 6 | 1 | 2 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
262 | 262 | 0 | 84.3.251.103 | fa:00:bc:90:d7:fa | fa:00:bc | 34935 | 84.3.251.102 | 0a:fe:ec:47:74:fb | 0a:fe:ec | 502 | ... | 0 | 1 | 2 | 0 | 0 | 0 | 5 | 1 | 0 | 1 |
263 | 263 | 0 | 84.3.251.101 | e6:3f:ac:c9:a8:8c | e6:3f:ac | 40745 | 84.3.251.18 | 00:80:f4:03:fb:12 | 00:80:f4 | 502 | ... | 0 | 1 | 2 | 0 | 0 | 0 | 6 | 1 | 2 | 1 |
264 | 264 | 0 | 84.3.251.101 | e6:3f:ac:c9:a8:8c | e6:3f:ac | 60719 | 84.3.251.18 | 00:80:f4:03:fb:12 | 00:80:f4 | 502 | ... | 0 | 1 | 2 | 0 | 0 | 0 | 6 | 1 | 2 | 1 |
265 | 265 | 0 | 84.3.251.101 | e6:3f:ac:c9:a8:8c | e6:3f:ac | 36137 | 84.3.251.18 | 00:80:f4:03:fb:12 | 00:80:f4 | 502 | ... | 0 | 1 | 2 | 0 | 0 | 0 | 6 | 1 | 2 | 1 |
266 | 266 | 0 | 84.3.251.104 | 4a:35:83:e0:3d:a4 | 4a:35:83 | 34199 | 84.3.251.102 | 0a:fe:ec:47:74:fb | 0a:fe:ec | 502 | ... | 0 | 1 | 2 | 0 | 0 | 0 | 5 | 1 | 0 | 1 |
267 rows × 77 columns
normal_df['Label'] = 0
attack_df['Label'] = 1
combined_df = pd.concat([normal_df, attack_df], axis = 0)
combined_df = pd.get_dummies(combined_df, columns = ['src_ip','src_mac','src_oui','dst_ip','dst_mac','dst_oui'], drop_first=True)
# Check for NaN
combined_df.replace([np.inf, -np.inf], pd.isna, inplace=True)
combined_df = combined_df.dropna(axis=1, how='all')
rows = combined_df.sum().sum()
nans = combined_df.isnull().sum().sum()
print('Total Rows: ', rows)
print('Detected NaN: ', nans, (nans/rows*100), '%')
Total Rows: 5146501725905314.0 Detected NaN: 0 0.0 %
X = combined_df.drop(['Label'], axis=1)
y = combined_df['Label']
X.shape # 530 rows, 101 columns
(530, 101)
#Build a logistic regression model
model = LogisticRegression()
rfe = RFE(model, n_features_to_select=20, step=1) # Use RFE to select the top 20 features
fit = rfe.fit(X, y)
pd.set_option('display.max_rows', None)
#Create a dataframe for the results
df_RFE_results = []
for i in range(X.shape[1]):
df_RFE_results.append(
{
'Feature_names': X.columns[i],
'Selected': rfe.support_[i],
'RFE_ranking': rfe.ranking_[i],
}
)
df_RFE_results = pd.DataFrame(df_RFE_results)
df_RFE_results.sort_values(by=['RFE_ranking'])
Feature_names | Selected | RFE_ranking | |
---|---|---|---|
13 | src2dst_first_seen_ms | True | 1 |
18 | dst2src_first_seen_ms | True | 1 |
17 | src2dst_bytes | True | 1 |
44 | dst2src_mean_piat_ms | True | 1 |
15 | src2dst_duration_ms | True | 1 |
14 | src2dst_last_seen_ms | True | 1 |
37 | bidirectional_stddev_piat_ms | True | 1 |
12 | bidirectional_bytes | True | 1 |
42 | src2dst_max_piat_ms | True | 1 |
19 | dst2src_last_seen_ms | True | 1 |
10 | bidirectional_duration_ms | True | 1 |
8 | bidirectional_first_seen_ms | True | 1 |
45 | dst2src_stddev_piat_ms | True | 1 |
46 | dst2src_max_piat_ms | True | 1 |
22 | dst2src_bytes | True | 1 |
41 | src2dst_stddev_piat_ms | True | 1 |
2 | src_port | True | 1 |
38 | bidirectional_max_piat_ms | True | 1 |
9 | bidirectional_last_seen_ms | True | 1 |
20 | dst2src_duration_ms | True | 1 |
40 | src2dst_mean_piat_ms | False | 2 |
36 | bidirectional_mean_piat_ms | False | 3 |
0 | id | False | 4 |
3 | dst_port | False | 5 |
52 | bidirectional_psh_packets | False | 6 |
51 | bidirectional_ack_packets | False | 7 |
11 | bidirectional_packets | False | 8 |
60 | src2dst_psh_packets | False | 9 |
68 | dst2src_psh_packets | False | 10 |
67 | dst2src_ack_packets | False | 11 |
59 | src2dst_ack_packets | False | 12 |
21 | dst2src_packets | False | 13 |
16 | src2dst_packets | False | 14 |
30 | src2dst_max_ps | False | 15 |
26 | bidirectional_max_ps | False | 16 |
28 | src2dst_mean_ps | False | 17 |
34 | dst2src_max_ps | False | 18 |
24 | bidirectional_mean_ps | False | 19 |
27 | src2dst_min_ps | False | 20 |
32 | dst2src_mean_ps | False | 21 |
31 | dst2src_min_ps | False | 22 |
23 | bidirectional_min_ps | False | 23 |
43 | dst2src_min_piat_ms | False | 24 |
29 | src2dst_stddev_ps | False | 25 |
25 | bidirectional_stddev_ps | False | 26 |
4 | protocol | False | 27 |
47 | bidirectional_syn_packets | False | 28 |
5 | ip_version | False | 29 |
53 | bidirectional_rst_packets | False | 30 |
69 | dst2src_rst_packets | False | 31 |
55 | src2dst_syn_packets | False | 32 |
63 | dst2src_syn_packets | False | 33 |
39 | src2dst_min_piat_ms | False | 34 |
33 | dst2src_stddev_ps | False | 35 |
80 | src_mac_e6:3f:ac:c9:a8:8c | False | 36 |
86 | src_oui_e6:3f:ac | False | 37 |
91 | dst_ip_84.3.251.18 | False | 38 |
54 | bidirectional_fin_packets | False | 39 |
62 | src2dst_fin_packets | False | 40 |
70 | dst2src_fin_packets | False | 41 |
35 | bidirectional_min_piat_ms | False | 42 |
89 | dst_ip_84.3.251.102 | False | 43 |
93 | dst_mac_0a:fe:ec:47:74:fb | False | 44 |
97 | dst_oui_0a:fe:ec | False | 45 |
72 | src_ip_84.3.251.103 | False | 46 |
87 | src_oui_fa:00:bc | False | 47 |
81 | src_mac_fa:00:bc:90:d7:fa | False | 48 |
90 | dst_ip_84.3.251.103 | False | 49 |
96 | dst_mac_fa:00:bc:90:d7:fa | False | 50 |
100 | dst_oui_fa:00:bc | False | 51 |
74 | src_ip_84.3.251.105 | False | 52 |
88 | src_oui_fe:bb:16 | False | 53 |
82 | src_mac_fe:bb:16:7b:c3:27 | False | 54 |
78 | src_mac_4a:35:83:e0:3d:a4 | False | 55 |
73 | src_ip_84.3.251.104 | False | 56 |
84 | src_oui_4a:35:83 | False | 57 |
95 | dst_mac_e6:3f:ac:c9:a8:8c | False | 58 |
99 | dst_oui_e6:3f:ac | False | 59 |
77 | src_mac_0a:fe:ec:47:74:fb | False | 60 |
83 | src_oui_0a:fe:ec | False | 61 |
71 | src_ip_84.3.251.102 | False | 62 |
85 | src_oui_74:46:a0 | False | 63 |
79 | src_mac_74:46:a0:bd:a7:1b | False | 64 |
76 | src_ip_84.3.251.20 | False | 65 |
92 | dst_ip_84.3.251.20 | False | 66 |
94 | dst_mac_74:46:a0:bd:a7:1b | False | 67 |
75 | src_ip_84.3.251.18 | False | 68 |
98 | dst_oui_74:46:a0 | False | 69 |
66 | dst2src_urg_packets | False | 70 |
7 | tunnel_id | False | 71 |
6 | vlan_id | False | 72 |
65 | dst2src_ece_packets | False | 73 |
64 | dst2src_cwr_packets | False | 74 |
58 | src2dst_urg_packets | False | 75 |
57 | src2dst_ece_packets | False | 76 |
56 | src2dst_cwr_packets | False | 77 |
1 | expiration_id | False | 78 |
48 | bidirectional_cwr_packets | False | 79 |
61 | src2dst_rst_packets | False | 80 |
49 | bidirectional_ece_packets | False | 81 |
50 | bidirectional_urg_packets | False | 82 |