diff --git a/neo_dolfin/ai/Fraud Detection/Class Dist.png b/neo_dolfin/ai/Fraud Detection/Class Dist.png
new file mode 100644
index 00000000..466f0072
Binary files /dev/null and b/neo_dolfin/ai/Fraud Detection/Class Dist.png differ
diff --git a/neo_dolfin/ai/Fraud Detection/Model Plot.png b/neo_dolfin/ai/Fraud Detection/Model Plot.png
new file mode 100644
index 00000000..75e0715c
Binary files /dev/null and b/neo_dolfin/ai/Fraud Detection/Model Plot.png differ
diff --git a/neo_dolfin/ai/Fraud Detection/TrainLoss.png b/neo_dolfin/ai/Fraud Detection/TrainLoss.png
new file mode 100644
index 00000000..7b58b490
Binary files /dev/null and b/neo_dolfin/ai/Fraud Detection/TrainLoss.png differ
diff --git a/neo_dolfin/ai/Fraud Detection/dolfin_frauddetection.py b/neo_dolfin/ai/Fraud Detection/dolfin_frauddetection.py
new file mode 100644
index 00000000..b0afc546
--- /dev/null
+++ b/neo_dolfin/ai/Fraud Detection/dolfin_frauddetection.py
@@ -0,0 +1,300 @@
+# -*- coding: utf-8 -*-
+"""DolFin_FraudDetection.ipynb
+
+Automatically generated by Colab.
+
+Original file is located at
+    https://colab.research.google.com/drive/1buWFaz7KlnwVV2mM-ZlE8VSTKh10Amkc
+
+# DolFin Fraudulent Transaction Model
+The purpose of this code is to develop a fraud detection model that monitors a number of features to determine whether a transaction is fraudulent. As the DolFin team receives more data, it can be combined with this dataset, or features can be added to or removed from the provided one.
+
+The code has been designed to run on Google Colab; make sure the dataset directory below is up to date and the file is located there.
+
+## Import relevant libraries
+"""
+
+# Import utilities
+import numpy as np
+import pandas as pd
+import datetime
+import os
+import warnings
+import matplotlib.pyplot as plt
+import seaborn as sns
+warnings.filterwarnings('ignore')
+
+# Deep learning
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import LabelEncoder, StandardScaler
+import tensorflow as tf
+from tensorflow import keras
+from tensorflow.keras import layers, Model, regularizers
+from tensorflow.keras.utils import to_categorical
+
+# Import Google Drive to access the file
+from google.colab import drive
+drive.mount('/content/drive')
+
+# Color scheme specified by the AI Team Lead
+colors = ['#343a40', '#0077bb', '#d9d9d9', '#fafafe', '#abb5be']
+
+"""## Importing the Dataset"""
+
+# Specify the dataset location
+# Dataset from https://www.kaggle.com/datasets/sauravmishraa/frauddataset
+dataset_dir = '/content/drive/MyDrive/Colab_Notebooks/Projects/DolFin_Fraud/'
+file_name = 'CreditCardData.csv'
+
+# Read the dataset and store it in a dataframe
+df = pd.read_csv(dataset_dir + file_name)
+
+# Shuffle the data (the result must be assigned, otherwise the shuffle is discarded)
+df = df.sample(frac=1).reset_index(drop=True)
+
+# Display a list of column names
+labels = list(df.columns)
+
+# Determine the number of samples and features
+num_samples = df.shape[0]
+num_features = df.shape[1]
+
+# Display this information in the terminal
+msg = f'The file {file_name} contains {num_features} features and {num_samples} samples\n'
+msg += f'The column names are: {labels}'
+print(msg)
+
+"""## Feature Conditioning
+This ensures that the currency sign is removed, categorical columns are encoded numerically, etc.
+"""
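+
+# Added illustration (not part of the original notebook): the raw 'Amount'
+# column stores values such as '£5.00', so the sign must be stripped before
+# the column can be cast to float. A stand-alone toy example of the same
+# transformation performed below:
+#   pd.Series(['£5.00', '£10.50']).str.replace('£', '', regex=False).astype('float32')
+#   -> [5.0, 10.5]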
+
+# Ensure the 'Amount' column is a string datatype before text cleaning
+df['Amount'] = df['Amount'].astype('string')
+
+def remove_currency(text):
+    if isinstance(text, str):
+        return text.replace('£', '')
+    else:
+        return text
+
+# Apply the remove-currency transformation
+df['Amount'] = df['Amount'].apply(remove_currency)
+
+# Specify the categorical columns
+categorical_columns = ['Day of Week', 'Type of Card', 'Entry Mode',
+                       'Type of Transaction', 'Merchant Group',
+                       'Country of Transaction', 'Country of Residence',
+                       'Gender', 'Bank']
+
+# Apply the categorical transformation
+df[categorical_columns] = df[categorical_columns].astype('category')
+
+# Replace any missing values with 0 (assignment avoids the deprecated
+# inplace fillna on a single column)
+df['Amount'] = df['Amount'].fillna(0)
+
+# Specify the numerical columns
+numerical_columns = ['Amount', 'Age']
+
+# Apply the float32 transformation
+df[numerical_columns] = df[numerical_columns].astype('float32')
+
+# Remove the features that will hinder the DNN or won't be available
+remove_columns = ['Date', 'Time', 'Transaction ID', 'Shipping Address']
+df = df.drop(columns=remove_columns, axis=1)
+
+# Convert the categorical features to one-hot encoding
+df = pd.get_dummies(df, columns=categorical_columns)
+
+# Apply scaling to the numerical features of the dataframe
+scaler = StandardScaler()
+df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
+
+# Ensure that the Fraud label is the last column
+df['Fraud'] = df.pop('Fraud')
+
+"""## Train / Test Split of the Data
+To simplify batching and prefetching and to limit input-pipeline errors, the data is converted into the TensorFlow Dataset format.
+"""
+
+# Cast everything to float32: the boolean one-hot columns otherwise trigger a
+# dtype error when converting to tensors, and float32 (unlike a cast to int,
+# which truncates them) preserves the scaled numerical features
+df = df.astype('float32')
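+
+# Added note (not in the original notebook): fraud labels are usually heavily
+# imbalanced, so a stratified split keeps the fraud ratio identical in the
+# train and test sets. A minimal alternative to the plain split below,
+# assuming 'Fraud' is the label column:
+#   train_ds, test_ds = train_test_split(df, test_size=0.2, random_state=42,
+#                                        stratify=df['Fraud'])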
+
+# Split the dataset into train and test
+test_split = 0.2
+train_ds, test_ds = train_test_split(df, test_size=test_split, random_state=42)
+
+# Convert to numpy arrays
+train_ds = train_ds.to_numpy()
+test_ds = test_ds.to_numpy()
+
+# Split into features and labels
+x_train = train_ds[:, :-1]
+y_train = train_ds[:, -1]
+x_test = test_ds[:, :-1]
+y_test = test_ds[:, -1]
+
+# Convert into tensorflow datasets
+train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train))
+test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test))
+
+# Shuffle the training set and batch both datasets (the test set is left
+# unshuffled so that predictions stay aligned with y_test later on)
+batch_size = 32
+train_ds = train_ds.shuffle(buffer_size=len(train_ds)).batch(batch_size)
+test_ds = test_ds.batch(batch_size)
+
+# Prefetch the train and test datasets
+train_ds = train_ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
+test_ds = test_ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
+
+"""## Data Exploration"""
+
+# Visualisation of the class balance in the dataset
+def describe_labels(y_train, y_test):
+    fig, ax = plt.subplots(1, 2, figsize=(16, 6))
+    colors = ['#343a40', '#0077bb', '#d9d9d9', '#fafafe', '#abb5be']
+
+    train_labels, train_label_count = np.unique(y_train, return_counts=True)
+    ax[0].bar(x=train_labels, height=train_label_count, color=colors)
+    ax[0].set_title('Class Count')
+
+    ax[1].pie(x=train_label_count, labels=train_labels, colors=colors,
+              pctdistance=0.8, autopct='%1.1f%%')
+    ax[1].set_title('Class Distribution')
+
+    plt.tight_layout()
+    plt.show()
+
+describe_labels(y_train, y_test)
+
+"""## Model Development
+A deep neural network was developed, and regularisation (L2 weight penalties and dropout) was added to improve how well the model generalises.
+"""
+
+# Develop a function to build and compile the model
+def build_model(num_features, num_classes):
+    # num_classes is unused here: the binary case is handled by a single
+    # sigmoid output unit
+    model = keras.Sequential([
+        layers.Dense(128, activation='relu', input_shape=(num_features,),
+                     kernel_regularizer=regularizers.l2(0.01)),
+        layers.Dropout(0.2),
+        layers.Dense(64, activation='relu',
+                     kernel_regularizer=regularizers.l2(0.01)),
+        layers.Dropout(0.2),  # Consistent dropout rate
+        layers.Dense(1, activation='sigmoid')
+    ])
+
+    # Compile the model
+    model.compile(loss='binary_crossentropy', optimizer='adam',
+                  metrics=['accuracy'])
+
+    return model
+
+# Create the input arguments for the model
+num_features = df.shape[1]
+num_classes = 2
+
+# Build the model (num_features - 1 because the Fraud label column is not an input)
+model = build_model(num_features - 1, num_classes)
+model.summary()
+
+# Plot the model architecture
+keras.utils.plot_model(model)
+
+"""## Model Testing
+TensorBoard was used to record the training runs.
+"""
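+
+# Added sketch (not in the original notebook): fraudulent transactions are
+# rare, so class weights can be supplied to penalise a missed fraud more
+# heavily than a false alarm. Assumes y_train is the 0/1 label vector built
+# above; pass class_weight=class_weight to model.fit() to enable it.
+neg, pos = np.bincount(y_train.astype(int))
+class_weight = {0: 1.0, 1: neg / pos}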
+
+# Create a TensorBoard callback
+log_dir = os.path.join("logs", "fit", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
+tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
+
+# Create an early stopping callback to prevent overfitting
+early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=2, verbose=1)
+
+# Store the callbacks in a list
+tf_callbacks = [tensorboard_callback, early_stopping]
+
+# Display tensorboard
+# Commented out IPython magic to ensure Python compatibility:
+# %load_ext tensorboard
+# %tensorboard --logdir logs
+
+# Specify the maximum number of epochs
+num_epochs = 10
+
+# Fit the model to the training data (batch_size is omitted because it is
+# ignored when fitting on an already-batched tf.data.Dataset)
+history = model.fit(train_ds, epochs=num_epochs, callbacks=tf_callbacks)
+
+# Save a copy of the model weights
+model.save_weights('fraud_model.h5')
+
+print(f"Type of x_train: {type(x_train)}")
+print(f"Type of y_train: {type(y_train)}")
+
+"""## Model Validation
+The model performed well due to the large amount of training data, and it trained for only a few epochs before early stopping kicked in.
+"""
+
+# Create a function to display the training accuracy and loss
+def plt_accuracy_loss(history):
+    # Plot the training history
+    accuracy = history.history['accuracy']
+    loss = history.history['loss']
+    epochs = range(len(accuracy))
+
+    figure, ax = plt.subplots(2, 1, figsize=(12, 8))
+
+    colors = sns.color_palette("crest", n_colors=2)
+
+    ax[0].plot(epochs, accuracy, '-o', color=colors[0])
+    ax[0].set_title('Training Accuracy')
+    ax[0].set_xlabel('Epochs')
+    ax[0].set_ylabel('Accuracy')
+
+    ax[1].plot(epochs, loss, '-o', color=colors[1])
+    ax[1].set_title('Training Loss')
+    ax[1].set_xlabel('Epochs')
+    ax[1].set_ylabel('Loss')
+
+    plt.tight_layout()
+    plt.show()
+
+plt_accuracy_loss(history)
+
+def confusion_matrix_heatmap(y_true, y_pred, labels, name):
+    # Compute the confusion matrix
+    num_classes = 2
+    conf_matrix = tf.math.confusion_matrix(labels=y_true, predictions=y_pred,
+                                           num_classes=num_classes)
+
+    # Plot the confusion matrix using seaborn
+    plt.figure(figsize=(12, 6))
+    sns.heatmap(conf_matrix, annot=True, fmt='g', cmap=colors,
+                xticklabels=labels,
+                yticklabels=labels)
+    plt.xlabel('Predicted Labels')
+    plt.ylabel('True Labels')
+    plt.title(f'{name} Confusion Matrix')
+    plt.show()
+
+name_2 = 'DolFin Fraudulent Transaction Detection'
+
+# Predict with the model and threshold the sigmoid output at 0.5
+# (np.argmax is wrong here: with a single output unit it always returns 0)
+y_pred = model.predict(test_ds)
+y_pred = (y_pred.ravel() > 0.5).astype(int)
+labels = ['Normal Transaction', 'Fraudulent Transaction']
+
+confusion_matrix_heatmap(y_test, y_pred, labels, name_2)
+
+# Calculate the model accuracy on the held-out test set
+loss, accuracy = model.evaluate(x_test, y_test)
+print(f'Test Accuracy is {accuracy:.2f}')
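+
+# Added sketch (not in the original notebook): accuracy alone is misleading on
+# imbalanced fraud data, so also report per-class precision, recall and F1.
+# Assumes y_test and the thresholded y_pred computed above.
+from sklearn.metrics import classification_report
+print(classification_report(y_test.astype(int), y_pred, target_names=labels))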