Binary file added neo_dolfin/ai/Fraud Detection/Class Dist.png
Binary file added neo_dolfin/ai/Fraud Detection/Model Plot.png
Binary file added neo_dolfin/ai/Fraud Detection/TrainLoss.png
300 changes: 300 additions & 0 deletions neo_dolfin/ai/Fraud Detection/dolfin_frauddetection.py
@@ -0,0 +1,300 @@
# -*- coding: utf-8 -*-
"""DolFin_FraudDetection.ipynb

Automatically generated by Colab.

Original file is located at
https://colab.research.google.com/drive/1buWFaz7KlnwVV2mM-ZlE8VSTKh10Amkc

# DolFin Fraudulent Transaction Model
The purpose of this code is to develop a fraud detection model that monitors a number of features to determine whether a transaction is fraudulent. As the DolFin team receives more data, it can be combined with this dataset, or features can be added to or removed from the provided one.

The code has been designed to run on Google Colab; make sure the dataset directory is updated and the file is located there.

## Import relevant libraries
"""

# Import utilities
import numpy as np
import pandas as pd
import datetime
import os
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
warnings.filterwarnings('ignore')

# Deep Learning
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Model, regularizers
from sklearn.preprocessing import StandardScaler

# Import Google Drive to access the file
from google.colab import drive
drive.mount('/content/drive')

# Color scheme specified by AI Team Lead
colors = ['#343a40', '#0077bb', '#d9d9d9', '#fafafe', '#abb5be']

"""## Importing the Dataset"""

# Specify the dataset location
# Dataset from https://www.kaggle.com/datasets/sauravmishraa/frauddataset
dataset_dir = '/content/drive/MyDrive/Colab_Notebooks/Projects/DolFin_Fraud/'
file_name = 'CreditCardData.csv'

# Read the dataset and store it in a dataframe
df = pd.read_csv(dataset_dir + file_name)

# Shuffle the data
df = df.sample(frac=1).reset_index(drop=True)

# Display a list of column names
labels = list(df.columns)

# Determine the number of samples and features
num_samples = df.shape[0]
num_features = df.shape[1]

# Display this information in the terminal
msg = f'The file {file_name} contains {num_features} features and {num_samples} samples \n'
msg += f'The column names are: {labels}'
print(msg)

"""## Feature Conditioning
This ensures that the currency sign is removed, the categorical columns are encoded numerically, etc.
"""

# Ensure the Amount column is a string datatype
df['Amount'] = df['Amount'].astype('string')

# Strip the pound sign from string values, leaving other types unchanged
def remove_currency(text):
    if isinstance(text, str):
        return text.replace('£', '')
    else:
        return text

# Apply the remove currency transformation
df['Amount'] = df['Amount'].apply(remove_currency)
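
# Quick illustrative sanity check: currency symbols are stripped from strings
# while non-string values pass through unchanged
assert remove_currency('£12.50') == '12.50'
assert remove_currency(3.5) == 3.5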

# Specify the categorical columns
categorical_columns = ['Day of Week', 'Type of Card', 'Entry Mode',
'Type of Transaction', 'Merchant Group',
'Country of Transaction', 'Country of Residence',
'Gender', 'Bank']

# Apply the categorical transformation
df[categorical_columns] = df[categorical_columns].astype('category')

# Replace any missing values with 0
df['Amount'] = df['Amount'].fillna(0)

# Convert the numerical columns to float 32
numerical_columns = ['Amount', 'Age']

# Apply the float32 transformation
df[numerical_columns] = df[numerical_columns].astype('float32')

# Remove the features that will hinder the DNN or won't be available
remove_columns = ['Date', 'Time', 'Transaction ID', 'Shipping Address']
df = df.drop(columns=remove_columns, axis=1)

# Convert the categorical features to one-hot encoding
df = pd.get_dummies(df, columns=categorical_columns)

# Apply scaling to the numerical features of the dataframe
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

# Ensure the numerical columns are floats
df[numerical_columns] = df[numerical_columns].astype('float')

# Ensure that the Fraud label is the last column
df['Fraud'] = df.pop('Fraud')

"""## Train / Test and Split the Data
To streamline training and limit errors, the datasets are converted into the TensorFlow dataset format.
"""

# Cast every column to float32. Mixed boolean (one-hot) and float columns would
# otherwise yield an object-dtype NumPy array that TensorFlow cannot convert;
# unlike casting to int, float32 preserves the standardised Amount/Age values.
df = df.astype('float32')

# Split the dataset into train and test.
test_split = 0.2
train_ds, test_ds = train_test_split(df, test_size=test_split, random_state=42)

# Convert to numpy array
train_ds = train_ds.to_numpy()
test_ds = test_ds.to_numpy()

# Split into features and labels
x_train = train_ds[:, :-1]
y_train = train_ds[:, -1]
x_test = test_ds[:, :-1]
y_test = test_ds[:, -1]

# Convert into tensorflow datasets
train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train))
test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test))

# Batch the datasets, shuffling only the training set; the test set keeps its
# order so that predictions remain aligned with y_test
batch_size = 32
train_ds = train_ds.shuffle(buffer_size=len(x_train)).batch(batch_size)
test_ds = test_ds.batch(batch_size)

# Prefetch the train and test datasets
train_ds = train_ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
test_ds = test_ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

"""## Data Exploration"""

# Visualisation of dataset
def describe_labels(y_train):
    fig, ax = plt.subplots(1, 2, figsize=(16, 6))
    colors = ['#343a40', '#0077bb', '#d9d9d9', '#fafafe', '#abb5be']

    # Count how many samples belong to each class in the training set
    train_labels, train_label_count = np.unique(y_train, return_counts=True)
    ax[0].bar(x=train_labels, height=train_label_count, color=colors)
    ax[0].set_title('Class Count')

    ax[1].pie(x=train_label_count, labels=train_labels, colors=colors,
              pctdistance=0.8, autopct='%1.1f%%')
    ax[1].set_title('Class Distribution')

    plt.tight_layout()
    plt.show()

describe_labels(y_train)
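
# Fraud datasets are typically highly imbalanced, so class weights can help the
# model pay more attention to the rare fraud class. This is an optional,
# illustrative sketch (not wired into the training call below); passing
# class_weights as the class_weight argument of model.fit would apply it.
class_labels, class_counts = np.unique(y_train, return_counts=True)
class_weights = {int(c): len(y_train) / (len(class_labels) * n)
                 for c, n in zip(class_labels, class_counts)}
print(f'Suggested class weights: {class_weights}')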

"""## Model Development
A deep neural network was developed, and regularisation was added to improve model generalisability.
"""

# Develop a function to build and compile the model
def build_model(num_features):

    model = keras.Sequential([
        layers.Dense(128, activation='relu', input_shape=(num_features,),
                     kernel_regularizer=regularizers.l2(0.01)),
        layers.Dropout(0.2),
        layers.Dense(64, activation='relu',
                     kernel_regularizer=regularizers.l2(0.01)),
        layers.Dropout(0.2),  # Consistent dropout rate
        layers.Dense(1, activation='sigmoid')  # Single unit for binary output
    ])

    # Compile the model for binary classification
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['accuracy'])

    return model

# Create the input arguments for the model (the Fraud label is the last column)
num_features = df.shape[1] - 1

# Build the model for use
model = build_model(num_features)
model.summary()
model.summary()

# Plot the model architecture (requires the pydot and graphviz packages)
keras.utils.plot_model(model)

"""## Model Testing
TensorBoard was used to record the training runs.
"""

# Commented out IPython magic to ensure Python compatibility.
# Create a tensorboard callback
log_dir = os.path.join("logs", "fit", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# Create a early stopping callback to prevent overfitting
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=2, verbose=1)

# Store the callbacks in a list
tf_callbacks = [tensorboard_callback, early_stopping]

# Display tensorboard
# %load_ext tensorboard
# %tensorboard --logdir logs

# Specify the maximum number of epochs
num_epochs = 10

# Fit the model to the training data; batch_size is not passed because
# train_ds is already batched (Keras rejects batch_size for dataset inputs)
history = model.fit(train_ds, epochs=num_epochs, callbacks=tf_callbacks)

# Save a copy of the model
model.save_weights('fraud_model.h5')
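
# A minimal sketch of reloading the saved weights for inference: rebuild the
# same architecture, then load the weights (assumes fraud_model.h5 is in the
# working directory)
inference_model = build_model(num_features)
inference_model.load_weights('fraud_model.h5')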

print(f"Type of x_train: {type(x_train)}")
print(f"Type of y_train: {type(y_train)}")

"""## Model Validation
The model performed well due to the large amount of training data, and trained for only a few epochs before early stopping kicked in.
"""

# Create a function to display the training accuracy and loss
def plt_accuracy_loss(history):
    # Plot the training history
    accuracy = history.history['accuracy']
    loss = history.history['loss']
    epochs = range(len(accuracy))

    figure, ax = plt.subplots(2, 1, figsize=(12, 8))

    colors = sns.color_palette("crest", n_colors=2)

    ax[0].plot(epochs, accuracy, '-o', color=colors[0])
    ax[0].set_title('Training Accuracy')
    ax[0].set_xlabel('Epochs')
    ax[0].set_ylabel('Accuracy')

    ax[1].plot(epochs, loss, '-o', color=colors[1])
    ax[1].set_title('Training Loss')
    ax[1].set_xlabel('Epochs')
    ax[1].set_ylabel('Loss')

    plt.tight_layout()
    plt.show()

plt_accuracy_loss(history)

def confusion_matrix_heatmap(y_true, y_pred, labels, name):
    # Compute the confusion matrix
    num_classes = 2
    conf_matrix = tf.math.confusion_matrix(labels=y_true, predictions=y_pred,
                                           num_classes=num_classes)

    # Plot the confusion matrix using seaborn
    plt.figure(figsize=(12, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='g', cmap=colors,
                xticklabels=labels,
                yticklabels=labels)
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.title(f'{name} Confusion Matrix')
    plt.show()

# Title for the confusion matrix plot
name_2 = 'DolFin Fraudulent Transaction Detection'

# Predict using the model and threshold the sigmoid output at 0.5
# (argmax over a single output column would always return class 0)
y_pred = model.predict(x_test)
y_pred = (y_pred > 0.5).astype(int).flatten()
labels = ['Normal Transaction', 'Fraudulent Transaction']

confusion_matrix_heatmap(y_test, y_pred, labels, name_2)

# Calculate the model accuracy
loss, accuracy = model.evaluate(x_test, y_test)
print(f'Test Accuracy is {accuracy:.2f}')
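
# Accuracy alone can be misleading on an imbalanced fraud dataset, so precision
# and recall are also worth checking. An illustrative sketch using
# scikit-learn, based on y_test and y_pred from above:
from sklearn.metrics import precision_score, recall_score, f1_score
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f'Precision: {precision:.2f}, Recall: {recall:.2f}, F1: {f1:.2f}')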