# if xgboost is needed
# import sys
# !{sys.executable} -m pip install xgboost
Collecting xgboost Downloading xgboost-1.7.3-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl (1.8 MB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.8/1.8 MB 14.7 MB/s eta 0:00:0000:0100:01 Requirement already satisfied: scipy in /usr/local/lib/python3.10/site-packages (from xgboost) (1.9.2) Requirement already satisfied: numpy in /usr/local/lib/python3.10/site-packages (from xgboost) (1.23.4) Installing collected packages: xgboost Successfully installed xgboost-1.7.3 [notice] A new release of pip available: 22.2.2 -> 23.0 [notice] To update, run: python3.10 -m pip install --upgrade pip
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from pandas import Series,DataFrame
from nfstream import NFStreamer
# Convert each packet capture into a flow table, one row per network flow.
# Both captures are read with identical settings so their columns line up
# when the two frames are combined later.
# NOTE(review): per the nfstream docs, n_dissections=0 turns off layer-7
# dissection and statistical_analysis=True adds the post-mortem statistical
# flow features -- confirm against the installed nfstream version.
_streamer_options = dict(n_dissections=0, statistical_analysis=True)

normal_df = NFStreamer(
    source="/Users/z/Python/atroposinsights-site/ai-site/assets/notebooks/normal.pcap",
    **_streamer_options,
).to_pandas()

attack_df = NFStreamer(
    source="/Users/z/Python/atroposinsights-site/ai-site/assets/notebooks/attack.pcap",
    **_streamer_options,
).to_pandas()

# Notebook display: show the flows extracted from the normal capture.
normal_df
id | expiration_id | src_ip | src_mac | src_oui | src_port | dst_ip | dst_mac | dst_oui | dst_port | ... | src2dst_rst_packets | src2dst_fin_packets | dst2src_syn_packets | dst2src_cwr_packets | dst2src_ece_packets | dst2src_urg_packets | dst2src_ack_packets | dst2src_psh_packets | dst2src_rst_packets | dst2src_fin_packets | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 84.3.251.103 | fa:00:bc:90:d7:fa | fa:00:bc | 36925 | 84.3.251.102 | 0a:fe:ec:47:74:fb | 0a:fe:ec | 502 | ... | 0 | 1 | 2 | 0 | 0 | 0 | 6 | 1 | 1 | 1 |
1 | 1 | 0 | 84.3.251.105 | fe:bb:16:7b:c3:27 | fe:bb:16 | 42083 | 84.3.251.103 | fa:00:bc:90:d7:fa | fa:00:bc | 502 | ... | 0 | 1 | 2 | 0 | 0 | 0 | 5 | 1 | 0 | 1 |
2 | 2 | 0 | 84.3.251.104 | 4a:35:83:e0:3d:a4 | 4a:35:83 | 35023 | 84.3.251.102 | 0a:fe:ec:47:74:fb | 0a:fe:ec | 502 | ... | 0 | 1 | 2 | 0 | 0 | 0 | 5 | 1 | 0 | 1 |
3 | 3 | 0 | 84.3.251.102 | 0a:fe:ec:47:74:fb | 0a:fe:ec | 52801 | 84.3.251.101 | e6:3f:ac:c9:a8:8c | e6:3f:ac | 502 | ... | 0 | 1 | 2 | 0 | 0 | 0 | 5 | 1 | 0 | 1 |
4 | 4 | 0 | 84.3.251.101 | e6:3f:ac:c9:a8:8c | e6:3f:ac | 37867 | 84.3.251.18 | 00:80:f4:03:fb:12 | 00:80:f4 | 502 | ... | 0 | 1 | 2 | 0 | 0 | 0 | 6 | 1 | 2 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
262 | 262 | 0 | 84.3.251.103 | fa:00:bc:90:d7:fa | fa:00:bc | 34935 | 84.3.251.102 | 0a:fe:ec:47:74:fb | 0a:fe:ec | 502 | ... | 0 | 1 | 2 | 0 | 0 | 0 | 5 | 1 | 0 | 1 |
263 | 263 | 0 | 84.3.251.101 | e6:3f:ac:c9:a8:8c | e6:3f:ac | 40745 | 84.3.251.18 | 00:80:f4:03:fb:12 | 00:80:f4 | 502 | ... | 0 | 1 | 2 | 0 | 0 | 0 | 6 | 1 | 2 | 1 |
264 | 264 | 0 | 84.3.251.101 | e6:3f:ac:c9:a8:8c | e6:3f:ac | 60719 | 84.3.251.18 | 00:80:f4:03:fb:12 | 00:80:f4 | 502 | ... | 0 | 1 | 2 | 0 | 0 | 0 | 6 | 1 | 2 | 1 |
265 | 265 | 0 | 84.3.251.101 | e6:3f:ac:c9:a8:8c | e6:3f:ac | 36137 | 84.3.251.18 | 00:80:f4:03:fb:12 | 00:80:f4 | 502 | ... | 0 | 1 | 2 | 0 | 0 | 0 | 6 | 1 | 2 | 1 |
266 | 266 | 0 | 84.3.251.104 | 4a:35:83:e0:3d:a4 | 4a:35:83 | 34199 | 84.3.251.102 | 0a:fe:ec:47:74:fb | 0a:fe:ec | 502 | ... | 0 | 1 | 2 | 0 | 0 | 0 | 5 | 1 | 0 | 1 |
267 rows × 77 columns
# Tag every flow with the capture it came from: 0 for the normal capture,
# 1 for the attack capture. This column is the classification target.
normal_df['Label'] = 0
attack_df['Label'] = 1

# Stack both captures into a single table (row labels of each frame are kept).
combined_df = pd.concat([normal_df, attack_df], axis=0)

# One-hot encode the address / vendor columns so they become numeric
# indicator columns; drop_first removes one level per column.
combined_df = pd.get_dummies(
    combined_df,
    columns=['src_ip', 'src_mac', 'src_oui', 'dst_ip', 'dst_mac', 'dst_oui'],
    drop_first=True,
)

# Check for NaN
# Map +/-inf to NaN so the checks below see them as missing values. The
# replacement value has to be np.nan; passing the pd.isna function itself
# would never produce a NaN.
combined_df = combined_df.replace([np.inf, -np.inf], np.nan)
# Discard columns that carry no data at all.
combined_df = combined_df.dropna(axis=1, how='all')

# Row count for the report, and total cell count as the denominator of the
# NaN percentage (NaNs are counted per cell, not per row).
rows = len(combined_df)
cells = combined_df.size
nans = combined_df.isnull().sum().sum()
# Guard against an empty frame so the report never divides by zero.
nan_pct = (nans / cells * 100) if cells else 0.0
print('Total Rows: ', rows)
print('Detected NaN: ', nans, nan_pct, '%')
Total Rows: 5146501725905314.0 Detected NaN: 0 0.0 %
# Split the combined table into the target vector and the feature matrix:
# y is the 'Label' column, X is every remaining column.
y = combined_df['Label']
X = combined_df.drop(columns=['Label'])

# Notebook display: dimensions of the feature matrix.
X.shape # 530 rows, 101 columns
(530, 101)
# Train a gradient-boosted tree classifier with default hyperparameters on
# the full feature matrix (no train/test split is made in this cell).
xbg_reg = XGBClassifier()
xbg_reg = xbg_reg.fit(X, y)

# Ask the underlying booster for per-feature importance measured by 'gain'.
# NOTE(review): per the xgboost docs, features never used in a split are
# omitted from this dict -- confirm for the installed version.
# NOTE(review): if the only feature reported is a capture timestamp, the
# model may be separating the two pcaps by recording time rather than by
# traffic behaviour -- worth checking for leakage.
f_importance = xbg_reg.get_booster().get_score(
    importance_type='gain',
)

# Notebook display: the importance mapping.
f_importance
{'bidirectional_first_seen_ms': 74.43582153320312}