Dataset-2.py

# -*- coding: utf-8 -*-
"""
Automatically generated by Colaboratory.

# *Required* python Library
"""

import pandas as pd 
from google.colab import files
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics,svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,precision_score
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np

!pip install kaggle

"""#Please attach Kaggel API Json file for downloading the dataset"""

#@title
uploaded = files.upload()
for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))
  
# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

"""#Downloading the Dataset from Kaggel"""

!kaggle datasets download -d preeti5607/ddos-attack-prevention
!unzip ddos-attack-prevention

"""#Load the file and create a Pandas DataFrame"""

raw_df = pd.concat(map(pd.read_csv, ['dataset_attack.csv', 'dataset_normal.csv']))
raw_df.dataframeName = 'dataset.csv'
nRow, nCol = raw_df.shape
print(f'There are {nRow} rows and {nCol} columns')

raw_df.info()

"""List of the Dataset classifiers of ML model Building"""

raw_df['tcp.time_delta'].unique()

raw_df = raw_df.assign(
      result = lambda dataframe: dataframe['tcp.time_delta'].map(lambda val: 'ATTACK' if val == 'attack' else 'NORMAL') 
  )

"""#Balance Distribuation check of the dataset classifiers"""

px.histogram(raw_df,x='result',color='result').show()

"""#Preprocessing of the Dataset for ML Model """

#@title
def handle_non_numerical_data(df):
    columns = df.columns.values
    for column in columns:
        text_digit_vals = {}
        def convert_to_int(val):
            return text_digit_vals[val]

        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            column_contents = df[column].values.tolist()
            unique_elements = set(column_contents)
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    x+=1

            df[column] = list(map(convert_to_int, df[column]))

    return df

clean_df=handle_non_numerical_data(raw_df)
clean_df.info()
clean_df.nunique(axis=0)

feature_column = ['frame.encap_type','frame.len','frame.protocols','ip.hdr_len','ip.len','ip.flags.rb','ip.flags.df','p.flags.mf','ip.frag_offset','ip.ttl','ip.proto','ip.src','ip.dst','tcp.srcport','tcp.dstport','tcp.len','tcp.ack','tcp.flags.res','tcp.flags.ns','tcp.flags.cwr','tcp.flags.ecn','tcp.flags.urg','tcp.flags.ack','tcp.flags.push','tcp.flags.reset','tcp.flags.syn','tcp.flags.fin','tcp.window_size']
X = clean_df[feature_column] 
Y = clean_df['result']
print(X.shape)
print(Y.shape)

"""Spliting the data for the Traning in to 70-30 ratio for Traning and Testing"""

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=3) 

print(X_train.shape)
print(y_train.shape)

print(X_test.shape)
print(y_test.shape)

"""# **List of models**

**1. Decision Tree**:
Decision Trees are a type of Supervised Machine Learning (you explain what the input is and what the corresponding output is in the training data) in which the data is continuously split based on a specific parameter.

**2. KNN : k-nearest neighbors**:
KNN is an abbreviation for "K-Nearest Neighbour." It is a machine learning algorithm that is supervised. The algorithm can solve classification and regression problem statements. The symbol 'K' represents the number of nearest neighbors to a new unknown variable that must be predicted or classified.

**3. Random Forest**:
Random forests, also known as random decision forests, are an ensemble learning method for classification, regression, and other tasks that works by constructing a large number of decision trees during training. For classification tasks, the random forest output is the class chosen by the majority of trees.

**4. SVM : Support vector machine**:
Support-vector machines are supervised learning models that analyze data for classification and regression analysis using learning algorithms.

**5. Neural Network**:
Neural networks, also known as artificial neural networks (ANNs) or simulated neural networks (SNNs), are a subset of machine learning that form the foundation of deep learning algorithms. Their name and structure are inspired by the human brain, and they mimic the way biological neurons communicate with one another.

**6. GBT : Gradient boosting tree (*Advance ML Model*)**
Gradient boosting is a machine learning technique that is commonly used in regression and classification tasks. It returns a prediction model in the form of an ensemble of weak prediction models, usually decision trees.

Temp List to store all the requied Model Data
"""

master_models=[] #[model_name,ml_model]
master_preduction=[] #[model_name,model_fit]
master_model_result=[] #[model_name,Accuracy, Precision, Detection  rate, False positive rate]

k=3 #Hyper paramaters 
master_models.insert(-1,["Decision Tree",DecisionTreeClassifier()])
master_models.insert(-1,["KNeighbors",KNeighborsClassifier(n_neighbors=k)])
master_models.insert(-1,["RandomForest",RandomForestClassifier()])
master_models.insert(-1,["Support vector machine",svm.SVC(kernel='linear')])
master_models.insert(-1,["Neural Network",MLPClassifier(hidden_layer_sizes=(21,21,21), activation='relu', solver='adam', max_iter=500)])
master_models.insert(-1,["Gradient boosting",GradientBoostingClassifier(random_state=0)])

"""MODELS Building"""

for name,model in master_models:
    print("Building model : ",name)
    model.fit(X_train,y_train)
    master_preduction.insert(-1,[name,model.predict(X_test)])

"""# ML Model Result """

def detection_rate_score(cm):
    # tp/(tp+fn)
    return cm[0][0]/(cm[0][0]+cm[1][0])
def false_positive_score(cm):
    # fp/(fp+tn)
    return cm[0][1]/(cm[0][1]+cm[1][1])

for name,predict in master_preduction:
    tmp_cm = confusion_matrix(y_test,predict)
    #Detection  rate
    #False positive rate
    master_model_result.insert(-1,[name,accuracy_score(y_test,predict),precision_score(y_test,predict),detection_rate_score(tmp_cm),false_positive_score(tmp_cm)])
    tmp_cm=0

"""Model Comparison DataFrame"""

result_df = pd.DataFrame(master_model_result,columns=["model_name","Accuracy", "Precision", "Detection  rate", "False positive rate"])
display(result_df)

"""Model Feature Importance"""

importance = model.feature_importances_
# summarize feature importance
for i,v in enumerate(importance):
	print('Feature: %0d, Score: %.5f' % (i,v))