# if xgboost is needed
# import sys
# !{sys.executable} -m pip install xgboost

Collecting xgboost
  Downloading xgboost-1.7.3-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl (1.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.8/1.8 MB 14.7 MB/s eta 0:00:0000:0100:01
Requirement already satisfied: scipy in /usr/local/lib/python3.10/site-packages (from xgboost) (1.9.2)
Requirement already satisfied: numpy in /usr/local/lib/python3.10/site-packages (from xgboost) (1.23.4)
Installing collected packages: xgboost
Successfully installed xgboost-1.7.3

[notice] A new release of pip available: 22.2.2 -> 23.0
[notice] To update, run: python3.10 -m pip install --upgrade pip


import pandas as pd
import numpy as np

from xgboost import XGBClassifier

from pandas import Series,DataFrame
from nfstream import NFStreamer


# Turn each capture file into a pandas DataFrame of network flows.
# Both captures are read with the same NFStreamer settings so the resulting
# frames share one column layout:
#   n_dissections=0          -> no layer-7 (application) dissection
#   statistical_analysis=True -> include the per-flow statistical features
_streamer_options = {"n_dissections": 0, "statistical_analysis": True}

normal_df = NFStreamer(
    source="/Users/z/Python/atroposinsights-site/ai-site/assets/notebooks/normal.pcap",
    **_streamer_options,
).to_pandas()

attack_df = NFStreamer(
    source="/Users/z/Python/atroposinsights-site/ai-site/assets/notebooks/attack.pcap",
    **_streamer_options,
).to_pandas()

# Preview the benign flows (rendered as a table when run as a notebook cell).
normal_df


# Tag every flow with its class (0 = normal capture, 1 = attack capture),
# then stack the two frames row-wise into a single training table.
for frame, label in ((normal_df, 0), (attack_df, 1)):
    frame['Label'] = label

combined_df = pd.concat((normal_df, attack_df), axis=0)


# One-hot encode the address-like columns; drop_first=True omits the first
# level of each column so the indicator columns are not perfectly collinear.
categorical_columns = ['src_ip', 'src_mac', 'src_oui', 'dst_ip', 'dst_mac', 'dst_oui']
combined_df = pd.get_dummies(
    combined_df,
    columns=categorical_columns,
    drop_first=True,
)


# Check for NaN
# Treat +/-inf as missing. The replacement value must be np.nan itself;
# passing the function `pd.isna` would not produce NaN values.
combined_df = combined_df.replace([np.inf, -np.inf], np.nan)
# Drop columns that carry no information at all (every value missing).
combined_df = combined_df.dropna(axis=1, how='all')

# Count rows and cells rather than summing the cell *values*; the NaN
# percentage is reported relative to the total number of cells.
rows = len(combined_df)
cells = combined_df.size
nans = combined_df.isnull().sum().sum()
nan_pct = (nans / cells * 100) if cells else 0.0  # guard against an empty frame
print('Total Rows: ', rows)
print('Detected NaN: ', nans, nan_pct, '%')

Total Rows:  5146501725905314.0
Detected NaN:  0 0.0 %


# Split the table into the feature matrix (everything except the class
# column) and the target vector.
# NOTE(review): X still contains the flow `id` and the absolute *_seen_ms
# timestamp columns. The recorded importance output lists only
# `bidirectional_first_seen_ms`, which suggests the model may be separating
# the two captures by recording time rather than by traffic behaviour --
# confirm, and consider dropping those columns.
target_column = 'Label'
y = combined_df[target_column]
X = combined_df.drop(columns=[target_column])


X.shape  # 530 rows, 101 columns

(530, 101)


# Train a gradient-boosted tree classifier with default hyper-parameters.
xbg_reg = XGBClassifier()
xbg_reg.fit(X, y)


# Ask the underlying booster for per-feature importance measured by gain.
# Only features that appear in the returned mapping are listed (see the
# recorded output below for which ones the trained model reported).
booster = xbg_reg.get_booster()
f_importance = booster.get_score(importance_type='gain')
f_importance

{'bidirectional_first_seen_ms': 74.43582153320312}

	id	expiration_id	src_ip	src_mac	src_oui	src_port	dst_ip	dst_mac	dst_oui	dst_port	...	src2dst_rst_packets	src2dst_fin_packets	dst2src_syn_packets	dst2src_cwr_packets	dst2src_ece_packets	dst2src_urg_packets	dst2src_ack_packets	dst2src_psh_packets	dst2src_rst_packets	dst2src_fin_packets
0	0	0	84.3.251.103	fa:00:bc:90:d7:fa	fa:00:bc	36925	84.3.251.102	0a:fe:ec:47:74:fb	0a:fe:ec	502	...	0	1	2	0	0	0	6	1	1	1
1	1	0	84.3.251.105	fe:bb:16:7b:c3:27	fe:bb:16	42083	84.3.251.103	fa:00:bc:90:d7:fa	fa:00:bc	502	...	0	1	2	0	0	0	5	1	0	1
2	2	0	84.3.251.104	4a:35:83:e0:3d:a4	4a:35:83	35023	84.3.251.102	0a:fe:ec:47:74:fb	0a:fe:ec	502	...	0	1	2	0	0	0	5	1	0	1
3	3	0	84.3.251.102	0a:fe:ec:47:74:fb	0a:fe:ec	52801	84.3.251.101	e6:3f:ac:c9:a8:8c	e6:3f:ac	502	...	0	1	2	0	0	0	5	1	0	1
4	4	0	84.3.251.101	e6:3f:ac:c9:a8:8c	e6:3f:ac	37867	84.3.251.18	00:80:f4:03:fb:12	00:80:f4	502	...	0	1	2	0	0	0	6	1	2	1
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
262	262	0	84.3.251.103	fa:00:bc:90:d7:fa	fa:00:bc	34935	84.3.251.102	0a:fe:ec:47:74:fb	0a:fe:ec	502	...	0	1	2	0	0	0	5	1	0	1
263	263	0	84.3.251.101	e6:3f:ac:c9:a8:8c	e6:3f:ac	40745	84.3.251.18	00:80:f4:03:fb:12	00:80:f4	502	...	0	1	2	0	0	0	6	1	2	1
264	264	0	84.3.251.101	e6:3f:ac:c9:a8:8c	e6:3f:ac	60719	84.3.251.18	00:80:f4:03:fb:12	00:80:f4	502	...	0	1	2	0	0	0	6	1	2	1
265	265	0	84.3.251.101	e6:3f:ac:c9:a8:8c	e6:3f:ac	36137	84.3.251.18	00:80:f4:03:fb:12	00:80:f4	502	...	0	1	2	0	0	0	6	1	2	1
266	266	0	84.3.251.104	4a:35:83:e0:3d:a4	4a:35:83	34199	84.3.251.102	0a:fe:ec:47:74:fb	0a:fe:ec	502	...	0	1	2	0	0	0	5	1	0	1

Import pcap data

Label and combine the dataframes

Get dummies for categorical features & remove NaN

Set X & y

Here we train XGBoost

Which features were kept?