diff --git a/neo_dolfin/ai/BERT Transaction Classification/AccLossChart.png b/neo_dolfin/ai/BERT Transaction Classification/AccLossChart.png
new file mode 100644
index 00000000..0a730a2c
Binary files /dev/null and b/neo_dolfin/ai/BERT Transaction Classification/AccLossChart.png differ
diff --git a/neo_dolfin/ai/BERT Transaction Classification/BERT_Transaction_Classification.pdf b/neo_dolfin/ai/BERT Transaction Classification/BERT_Transaction_Classification.pdf
new file mode 100644
index 00000000..53995b40
Binary files /dev/null and b/neo_dolfin/ai/BERT Transaction Classification/BERT_Transaction_Classification.pdf differ
diff --git a/neo_dolfin/ai/BERT Transaction Classification/bert_transaction_classification.py b/neo_dolfin/ai/BERT Transaction Classification/bert_transaction_classification.py
new file mode 100644
index 00000000..9f4313ff
--- /dev/null
+++ b/neo_dolfin/ai/BERT Transaction Classification/bert_transaction_classification.py
@@ -0,0 +1,280 @@
+# -*- coding: utf-8 -*-
+"""BERT_Transaction_Classification.ipynb
+
+Automatically generated by Colab.
+
+Original file is located at
+    https://colab.research.google.com/drive/12xvrn9rszCSzKdRl99XX6Q6SHU6gZWPb
+
+# DolFin Transaction Classification
+The purpose of this Python notebook is to develop an NLP model that reads the transaction description and classifies the transaction into a category. BERT was chosen because it is a current state-of-the-art NLP model and, given our lack of training data, transfer learning is a practical solution.
+
+# Libraries Required
+The following libraries are required to get the model operational.
+"""
+
+# Download and install the required non-standard libraries for the model
+!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py
+!pip install tokenization
+!pip install transformers
+!pip install tensorflow-hub
+
+# Processing utilities
+import datetime
+import os
+import numpy as np
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from tensorflow.keras.utils import to_categorical
+# from sklearn.preprocessing import LabelEncoder
+from sklearn import preprocessing
+import tokenization
+from transformers import BertTokenizer
+
+# Natural language processing
+from tensorflow.keras.preprocessing.text import Tokenizer
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from transformers import BertTokenizer, TFBertForSequenceClassification
+import nltk
+nltk.download('punkt')
+nltk.download('stopwords')
+
+# Deep learning
+import tensorflow as tf
+import tensorflow_hub as hub
+from tensorflow import keras
+from tensorflow.keras import Model, layers
+
+# Plotting utilities
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+# Mount Google Drive
+from google.colab import drive
+drive.mount('/content/drive')
+
+colors = ['#343a40', '#0077bb', '#d9d9d9', '#fafafe', '#abb5be']
+sns.set_palette(sns.color_palette(colors))
+
+"""# Data Labeling
+The data is labeled into categories using a keyword search: we iterate through each transaction description field and, if a keyword matches, assign a class to the label feature. The matching logic is sketched in the example below.
+The code can be extended to more classes or more keywords by amending the lists, or by adding new lists for new classes.
+"""
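+
+# Illustrative sketch only (not from the original notebook): how a single
+# description string is matched against one keyword list. The description
+# below is a made-up example; the keyword list mirrors keyword_transport
+# defined further down.
+example_description = 'UBER TRIP HELP.UBER.COM'
+example_keywords = ['insurance', 'car', 'parking', 'taxi', 'uber', 'toll', 'fuel', 'bike']
+example_is_transport = any(kw in example_description.lower() for kw in example_keywords)
+print(example_is_transport)  # True, so this transaction would be labeled 'transport'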
+
+# Data location and file name
+dataset_dir = '/content/drive/MyDrive/Colab_Notebooks/Projects/DolFin_Transaction_Classification/'
+file_name = 'transaction_ut.csv'
+
+# Load the dataset into a dataframe
+df = pd.read_csv(dataset_dir + file_name)
+
+# Shuffle the dataframe
+df = df.sample(frac=1).reset_index(drop=True)
+
+# Store the column names
+column_names = list(df.columns)
+
+# Determine the shape of the data
+num_samples = df.shape[0]
+num_features = df.shape[1]
+
+# Keyword lists used to assign a class to each of the transactions
+# Home
+keyword_home = ['energy', 'water', 'telecommunications', 'electricity', 'mortgage', 'loan']
+
+# Personal
+keyword_personal = ['payroll', 'wage', 'wages', 'children', 'fitness', 'education', 'health']
+
+# Good Life
+keyword_goodlife = ['atm', 'tran', 'transaction', 'fee', 'holiday', 'travel', 'takeaway']
+
+# Transport
+keyword_transport = ['insurance', 'car', 'parking', 'taxi', 'uber', 'toll', 'fuel', 'bike']
+
+labels = ['home', 'personal', 'goodlife', 'transport']
+
+keyword_list = [keyword_home, keyword_personal, keyword_goodlife, keyword_transport]
+
+# Start from a default class so unmatched rows keep a valid label
+if 'class' not in df.columns:
+    df['class'] = 'uncategorised'
+
+# Apply the keyword search to the dataset
+for label, keywords in zip(labels, keyword_list):
+    df['class'] = df.apply(
+        lambda row: label if any(kw.lower() in str(row['description']).lower() for kw in keywords) else row['class'],
+        axis=1)
+
+# Display the unique values
+df['class'].unique()
+
+"""# Tokenization
+Not all versions of BERT are compatible with TensorFlow; after trial and error it was found that some third-party BERT libraries are not supported by TensorFlow. A compatible version of BERT can be loaded from TensorFlow Hub, and we then wrap this model in a Keras layer.
+A specific tokenizer needs to be used with the BERT model. For the tokenizer we specify the vocab file and the word casing, then initialise the tokenizer.
+"""
+
+# Download the BERT model from TensorFlow Hub
+model_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'
+# Create a Keras layer with the model and set trainable to False
+bert_layer = hub.KerasLayer(model_url, trainable=False)
+
+# Specify the vocab file for the BERT model tokenizer
+vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
+
+# Specify whether the tokenizer should change text to lower case
+tokenizer_case = bert_layer.resolved_object.do_lower_case.numpy()
+# tokenizer = tokenization.FullTokenizer(vocab_file, tokenizer_case)
+
+# Create a tokenizer instance
+tokenizer = BertTokenizer(vocab_file, do_lower_case=tokenizer_case)
+
+"""# Preprocessing
+The NLP model cannot interpret plain text; it understands numbers, which is what tokenization produces. The function written below converts a string of words into a list of individual tokens, then converts each token into a number representing that word. Each list is then padded with zeros so that every sequence is the same length. A small illustration of the tokenizer output is shown below, followed by the encoding function itself.
+"""
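+
+# Illustrative sketch only (not from the original notebook): what the tokenizer
+# produces for a single description before padding. The example string and the
+# exact sub-word split shown in the comments are assumptions.
+sample_text = 'woolworths supermarket purchase'
+sample_tokens = tokenizer.tokenize(sample_text)              # e.g. ['wool', '##worth', '##s', 'supermarket', 'purchase']
+sample_ids = tokenizer.convert_tokens_to_ids(sample_tokens)  # integer IDs the BERT layer understands
+sample_padded = sample_ids + [0] * max(0, 10 - len(sample_ids))  # zero-padded to a fixed length of 10
+print(sample_tokens, sample_padded)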
+
+# Create a function that encodes the text as tokens for the BERT model
+def bert_encode(texts, tokenizer, max_len=512):
+    # Initialise the token, mask and segment lists
+    all_tokens = []
+    all_masks = []
+    all_segments = []
+
+    # Iterate through each of the transaction descriptions
+    for text in texts:
+        # Replace the entry with the tokenized version of the entry
+        text = tokenizer.tokenize(text)
+
+        # Entries are truncated to make room for two special tokens
+        text = text[:max_len - 2]
+        # [CLS] is added to the start of the sequence and [SEP] to the end
+        input_sequence = ["[CLS]"] + text + ["[SEP]"]
+        # Pad the sequence to ensure that all sequences are the same length
+        pad_len = max_len - len(input_sequence)
+
+        # BERT uses segment IDs to distinguish sentences; here they are all set to 0
+        tokens = tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len
+        pad_masks = [1] * len(input_sequence) + [0] * pad_len
+        segment_ids = [0] * max_len
+
+        # Append the tokens, masks and segment IDs to the respective lists
+        all_tokens.append(tokens)
+        all_masks.append(pad_masks)
+        all_segments.append(segment_ids)
+
+    # Return the encoded tokens, masks and segment IDs
+    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)
+
+# Encode the training data using the preprocessing function above
+max_len = 250
+x = bert_encode(df['description'], tokenizer, max_len=max_len)
+y = df['class']
+y = pd.get_dummies(y)
+
+# Arrange the encoded inputs and one-hot labels for training
+train_input = list(x)
+train_labels = y.to_numpy(dtype=np.float32)
+
+# Split the data into training and testing
+# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
+
+# # Visualisation of dataset
+# def describe_labels(y_train, y_test):
+#     fig, ax = plt.subplots(1, 2, figsize=(16,6))
+#     # colors = sns.color_palette('deep')
+#     colors = ['#343a40', '#0077bb', '#d9d9d9', '#fafafe', '#abb5be']
+
+#     train_unique_labels = np.unique(y_train, return_counts=True)
+#     train_labels = train_unique_labels[0]
+#     train_label_count = train_unique_labels[1]
+#     ax[0].bar(x=train_labels, height=train_label_count, color=colors)
+#     ax[0].set_title('Class Count')
+
+#     ax[1].pie(x=train_label_count, labels=train_labels, colors=colors, pctdistance=0.8, autopct='%1.1f%%')
+#     ax[1].set_title('Class Distribution')
+
+#     plt.colorbar
+#     plt.tight_layout()
+#     plt.show()
+#     return None
+
+# describe_labels(y_train, y_test)
+
+# Build a function for generating the BERT classification model
+def build_model(bert_layer, max_len=512, num_classes=4):
+    # The input layers correspond to the inputs required by this version of BERT
+    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
+    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
+    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")
+
+    # The BERT model layer takes the three input layers and returns pooled and sequence outputs
+    pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
+
+    # The classifier output extracts the first token from the sequence,
+    # which is expected to be the [CLS] token added earlier
+    classifier_output = sequence_output[:, 0, :]
+
+    # Create some dense layers for the classification portion of the model.
+    # Dropout layers are used to make the model generalise better.
+    layer = tf.keras.layers.Dense(64, activation='relu')(classifier_output)
+    layer = tf.keras.layers.Dropout(0.2)(layer)
+    layer = tf.keras.layers.Dense(32, activation='relu')(layer)
+    layer = tf.keras.layers.Dropout(0.2)(layer)
+    output_layer = tf.keras.layers.Dense(num_classes, activation='softmax')(layer)
+
+    # Build the model from the layers defined above and compile it
+    model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=output_layer)
+    model.compile(tf.keras.optimizers.Adam(learning_rate=2e-5), loss='categorical_crossentropy', metrics=['accuracy'])
+
+    # Return the model
+    return model
+
+# Create a model instance for our transaction classification,
+# matching the output size to the number of one-hot label columns
+model = build_model(bert_layer, max_len=max_len, num_classes=train_labels.shape[1])
+# Display a summary of the model
+model.summary()
+
+"""# Training Model
+Training the BERT model on the DolFin classification data is the same process as with standard TensorFlow classification models. I have included checkpointing and early stopping in the callbacks to ensure that we keep the best model and that it does not overfit the training data.
+"""
+
+# Create our training callbacks
+checkpoint = tf.keras.callbacks.ModelCheckpoint('model.h5', monitor='val_accuracy', save_best_only=True, verbose=1)
+earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=2, verbose=1)
+tensorflow_callbacks = [checkpoint, earlystopping]
+
+# Specify the number of epochs, batch size and validation split
+num_epochs = 10
+batch_size = 32
+val_split = 0.2
+
+# Train the NLP model on the data and store its performance
+history = model.fit(
+    train_input, train_labels,
+    validation_split=val_split,
+    epochs=num_epochs,
+    callbacks=tensorflow_callbacks,
+    batch_size=batch_size,
+    verbose=1
+)
+
+# Save a copy of the model weights
+model.save_weights('BERT_model.h5')
+
+# Create a function to display the training accuracy and loss
+def plt_accuracy_loss(history):
+    # Plot the training history
+    accuracy = history.history['accuracy']
+    loss = history.history['loss']
+    epochs = range(len(accuracy))
+
+    figure, ax = plt.subplots(2, 1, figsize=(12, 8))
+
+    ax[0].plot(epochs, accuracy, '-o', color=colors[0])
+    ax[0].set_title('Training Accuracy')
+    ax[0].set_xlabel('Epochs')
+    ax[0].set_ylabel('Accuracy')
+
+    ax[1].plot(epochs, loss, '-o', color=colors[1])
+    ax[1].set_title('Training Loss')
+    ax[1].set_xlabel('Epochs')
+    ax[1].set_ylabel('Loss')
+
+    plt.tight_layout()
+    plt.show()
+
+plt_accuracy_loss(history)
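+
+# A minimal usage sketch (not from the original notebook): classify new
+# transaction descriptions with the trained model. The example descriptions
+# are made up; the class names come from the one-hot columns created by
+# pd.get_dummies above.
+new_descriptions = pd.Series(['uber trip to airport', 'monthly electricity bill'])
+new_tokens, new_masks, new_segments = bert_encode(new_descriptions, tokenizer, max_len=max_len)
+predictions = model.predict([new_tokens, new_masks, new_segments])
+predicted_classes = [y.columns[i] for i in np.argmax(predictions, axis=1)]
+print(predicted_classes)
\ No newline at end of file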