Binary file added neo_dolfin/ai/Fraud Detection/Class Dist.png
Binary file added neo_dolfin/ai/Fraud Detection/Model Plot.png
Binary file added neo_dolfin/ai/Fraud Detection/TrainLoss.png
300 changes: 300 additions & 0 deletions neo_dolfin/ai/Fraud Detection/dolfin_frauddetection.py
@@ -0,0 +1,300 @@
# -*- coding: utf-8 -*-
"""DolFin_FraudDetection.ipynb

Automatically generated by Colab.

Original file is located at
https://colab.research.google.com/drive/1buWFaz7KlnwVV2mM-ZlE8VSTKh10Amkc

# DolFin Fraudulent Transaction Model
The purpose of this code is to develop a fraud detection model that monitors a number of features to determine whether a transaction is fraudulent. As the DolFin team receives more data, it can be combined with this dataset, or features can be added to or removed from the provided one.

The code has been designed to run on Google Colab; make sure the dataset directory is updated and the file is located there.

## Import relevant libraries
"""

# Import utilities
import numpy as np
import pandas as pd
import datetime
import os
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
warnings.filterwarnings('ignore')

# Deep Learning
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Model, regularizers
from sklearn.preprocessing import StandardScaler

# Import Google Drive to access the file
from google.colab import drive
drive.mount('/content/drive')

# Color scheme specified by AI Team Lead
colors = ['#343a40', '#0077bb', '#d9d9d9', '#fafafe', '#abb5be']

"""## Importing the Dataset"""

# Specify the dataset location
# Dataset from https://www.kaggle.com/datasets/sauravmishraa/frauddataset
dataset_dir = '/content/drive/MyDrive/Colab_Notebooks/Projects/DolFin_Fraud/'
file_name = 'CreditCardData.csv'

# Read the dataset and store it in a dataframe
df = pd.read_csv(dataset_dir + file_name)

# Shuffle the data
df = df.sample(frac=1).reset_index(drop=True)

# Display a list of column names
labels = list(df.columns)

# Determine the number of samples and features
num_samples = df.shape[0]
num_features = df.shape[1]

# Display this information in the terminal
msg = f'The file {file_name} contains {num_features} features and {num_samples} samples \n'
msg += f'The column names are: {labels}'
print(msg)

"""## Feature Conditioning
This ensures that the currency sign is removed, the categorical columns are encoded numerically, etc.
"""

# Ensure the Amount column is a string datatype
df['Amount'] = df['Amount'].astype('string')

# Strip the pound sign from string values, leaving other types unchanged
def remove_currency(text):
    if isinstance(text, str):
        return text.replace('£', '')
    else:
        return text

# Apply the remove currency transformation
df['Amount'] = df['Amount'].apply(remove_currency)
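
# Quick illustrative sanity check: currency symbols are stripped from strings
# while non-string values pass through unchanged
assert remove_currency('£12.50') == '12.50'
assert remove_currency(3.5) == 3.5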

# Specify the categorical columns
categorical_columns = ['Day of Week', 'Type of Card', 'Entry Mode',
'Type of Transaction', 'Merchant Group',
'Country of Transaction', 'Country of Residence',
'Gender', 'Bank']

# Apply the categorical transformation
df[categorical_columns] = df[categorical_columns].astype('category')

# Replace any missing values with 0
df['Amount'] = df['Amount'].fillna(0)

# Convert the numerical columns to float 32
numerical_columns = ['Amount', 'Age']

# Apply the float32 transformation
df[numerical_columns] = df[numerical_columns].astype('float32')

# Remove the features that will hinder the DNN or won't be available
remove_columns = ['Date', 'Time', 'Transaction ID', 'Shipping Address']
df = df.drop(columns=remove_columns, axis=1)

# Convert the categorical features to one-hot encoding
df = pd.get_dummies(df, columns=categorical_columns)

# Apply scaling to the numerical features of the dataframe
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

# Ensure the numerical columns are floats
df[numerical_columns] = df[numerical_columns].astype('float')

# Ensure that the Fraud label is the last column
df['Fraud'] = df.pop('Fraud')

"""## Train / Test and Split the Data
To streamline training and limit errors, the datasets are converted into the TensorFlow dataset format.
"""

# Cast every column to float32. Mixed boolean (one-hot) and float columns would
# otherwise yield an object-dtype NumPy array that TensorFlow cannot convert;
# unlike casting to int, float32 preserves the standardised Amount/Age values.
df = df.astype('float32')

# Split the dataset into train and test.
test_split = 0.2
train_ds, test_ds = train_test_split(df, test_size=test_split, random_state=42)

# Convert to numpy array
train_ds = train_ds.to_numpy()
test_ds = test_ds.to_numpy()

# Split into features and labels
x_train = train_ds[:, :-1]
y_train = train_ds[:, -1]
x_test = test_ds[:, :-1]
y_test = test_ds[:, -1]

# Convert into tensorflow datasets
train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train))
test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test))

# Batch the datasets, shuffling only the training set; the test set keeps its
# order so that predictions remain aligned with y_test
batch_size = 32
train_ds = train_ds.shuffle(buffer_size=len(x_train)).batch(batch_size)
test_ds = test_ds.batch(batch_size)

# Prefetch the train and test datasets
train_ds = train_ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
test_ds = test_ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

"""## Data Exploration"""

# Visualisation of dataset
def describe_labels(y_train):
    fig, ax = plt.subplots(1, 2, figsize=(16, 6))
    colors = ['#343a40', '#0077bb', '#d9d9d9', '#fafafe', '#abb5be']

    # Count how many samples belong to each class in the training set
    train_labels, train_label_count = np.unique(y_train, return_counts=True)
    ax[0].bar(x=train_labels, height=train_label_count, color=colors)
    ax[0].set_title('Class Count')

    ax[1].pie(x=train_label_count, labels=train_labels, colors=colors,
              pctdistance=0.8, autopct='%1.1f%%')
    ax[1].set_title('Class Distribution')

    plt.tight_layout()
    plt.show()

describe_labels(y_train)
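
# Fraud datasets are typically highly imbalanced, so class weights can help the
# model pay more attention to the rare fraud class. This is an optional,
# illustrative sketch (not wired into the training call below); passing
# class_weights as the class_weight argument of model.fit would apply it.
class_labels, class_counts = np.unique(y_train, return_counts=True)
class_weights = {int(c): len(y_train) / (len(class_labels) * n)
                 for c, n in zip(class_labels, class_counts)}
print(f'Suggested class weights: {class_weights}')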

"""## Model Development
A deep neural network was developed, and regularisation was added to improve model generalisability.
"""

# Develop a function to build and compile the model
def build_model(num_features):

    model = keras.Sequential([
        layers.Dense(128, activation='relu', input_shape=(num_features,),
                     kernel_regularizer=regularizers.l2(0.01)),
        layers.Dropout(0.2),
        layers.Dense(64, activation='relu',
                     kernel_regularizer=regularizers.l2(0.01)),
        layers.Dropout(0.2),  # Consistent dropout rate
        layers.Dense(1, activation='sigmoid')  # Single unit for binary output
    ])

    # Compile the model for binary classification
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['accuracy'])

    return model

# Create the input arguments for the model (the Fraud label is the last column)
num_features = df.shape[1] - 1

# Build the model for use
model = build_model(num_features)
model.summary()
model.summary()

# Plot the model architecture (requires the pydot and graphviz packages)
keras.utils.plot_model(model)

"""## Model Testing
TensorBoard was used to record the training runs.
"""

# Commented out IPython magic to ensure Python compatibility.
# Create a tensorboard callback
log_dir = os.path.join("logs", "fit", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# Create a early stopping callback to prevent overfitting
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=2, verbose=1)

# Store the callbacks in a list
tf_callbacks = [tensorboard_callback, early_stopping]

# Display tensorboard
# %load_ext tensorboard
# %tensorboard --logdir logs

# Specify the maximum number of epochs
num_epochs = 10

# Fit the model to the training data; batch_size is not passed because
# train_ds is already batched (Keras rejects batch_size for dataset inputs)
history = model.fit(train_ds, epochs=num_epochs, callbacks=tf_callbacks)

# Save a copy of the model
model.save_weights('fraud_model.h5')
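
# A minimal sketch of reloading the saved weights for inference: rebuild the
# same architecture, then load the weights (assumes fraud_model.h5 is in the
# working directory)
inference_model = build_model(num_features)
inference_model.load_weights('fraud_model.h5')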

print(f"Type of x_train: {type(x_train)}")
print(f"Type of y_train: {type(y_train)}")

"""## Model Validation
The model performed well due to the large amount of training data, and trained for only a few epochs before early stopping kicked in.
"""

# Create a function to display the training accuracy and loss
def plt_accuracy_loss(history):
    # Plot the training history
    accuracy = history.history['accuracy']
    loss = history.history['loss']
    epochs = range(len(accuracy))

    figure, ax = plt.subplots(2, 1, figsize=(12, 8))

    colors = sns.color_palette("crest", n_colors=2)

    ax[0].plot(epochs, accuracy, '-o', color=colors[0])
    ax[0].set_title('Training Accuracy')
    ax[0].set_xlabel('Epochs')
    ax[0].set_ylabel('Accuracy')

    ax[1].plot(epochs, loss, '-o', color=colors[1])
    ax[1].set_title('Training Loss')
    ax[1].set_xlabel('Epochs')
    ax[1].set_ylabel('Loss')

    plt.tight_layout()
    plt.show()

plt_accuracy_loss(history)

def confusion_matrix_heatmap(y_true, y_pred, labels, name):
    # Compute the confusion matrix
    num_classes = 2
    conf_matrix = tf.math.confusion_matrix(labels=y_true, predictions=y_pred,
                                           num_classes=num_classes)

    # Plot the confusion matrix using seaborn
    plt.figure(figsize=(12, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='g', cmap=colors,
                xticklabels=labels,
                yticklabels=labels)
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.title(f'{name} Confusion Matrix')
    plt.show()

# Title for the confusion matrix plot
name_2 = 'DolFin Fraudulent Transaction Detection'

# Predict using the model and threshold the sigmoid output at 0.5
# (argmax over a single output column would always return class 0)
y_pred = model.predict(x_test)
y_pred = (y_pred > 0.5).astype(int).flatten()
labels = ['Normal Transaction', 'Fraudulent Transaction']

confusion_matrix_heatmap(y_test, y_pred, labels, name_2)

# Calculate the model accuracy
loss, accuracy = model.evaluate(x_test, y_test)
print(f'Test Accuracy is {accuracy:.2f}')
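
# Accuracy alone can be misleading on an imbalanced fraud dataset, so precision
# and recall are also worth checking. An illustrative sketch using
# scikit-learn, based on y_test and y_pred from above:
from sklearn.metrics import precision_score, recall_score, f1_score
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f'Precision: {precision:.2f}, Recall: {recall:.2f}, F1: {f1:.2f}')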