194 changes: 194 additions & 0 deletions neo_dolfin/ai/Transaction_Descrition_GAN/dolfin_transaction_gan.py
@@ -0,0 +1,194 @@
# -*- coding: utf-8 -*-
"""DolFin_Transaction_GAN.ipynb

Automatically generated by Colab.

Original file is located at
https://colab.research.google.com/drive/1VDTyx4ACqsM7hEp_e9Nn8xXGA6q2-oAC

# DolFin GAN for Transaction Data Generation
The purpose of this notebook was to generate additional transaction descriptions for training the NLP models that are replacing the Basiq API. A lot of time went into researching how to properly build a GAN for NLP, and there were very limited examples online. The final outcome was not what I would consider successful, and I would not recommend continuing to work on this code if there is even a remote possibility that the DolFin team will obtain more real transaction data. A small part of the training loop was produced with the help of generative AI; this is indicated where it occurs.

## Import Libraries
"""

# Processing Utilities
import datetime
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Natural Language Processing
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import BertTokenizer, TFBertForSequenceClassification
import nltk
nltk.download('punkt')
nltk.download('stopwords')

# Deep learning (TensorFlow itself is already imported above)
from tensorflow import keras
from tensorflow.keras import Model, layers

# Plotting utilities
import matplotlib.pyplot as plt
import seaborn as sns

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

"""## Import Data
Google Colab was used for this code. Configure the runtime to use a GPU, place the data file in your target directory, and update dataset_dir to point to it.
"""

# Data file location
dataset_dir = '/content/drive/MyDrive/Colab_Notebooks/Projects/DolFin_Transaction_Classification/'
file_name = 'transaction_ut.csv'

# Load the dataset into a dataframe
df = pd.read_csv(dataset_dir + file_name)

# Shuffle the dataframe
df = df.sample(frac=1).reset_index(drop=True)

# Store the column names
labels = list(df.columns)

# Determine the shape of the data
num_samples = df.shape[0]
num_features = df.shape[1]

msg = f'The file {file_name} contains {num_features} features and {num_samples} samples \n'
msg += f'The column names are: {labels}'
print(msg)

"""## Stop Word & Tokenisation
Stop words are joining words that hold no meaning, we remove these because it is the equivilent of removing bad features. Tokenisation is used to convert a string of words into a list of words, where each entry into the list is an individual word.
"""

# Create a stop words variable
stop_words = set(stopwords.words('english'))

# Preprocess the text in the description field
def preprocess_content(text):
    # Convert the text to lower case and split it into tokens
    tokens = word_tokenize(text.lower())
    # Remove stop words and non-alphabetic tokens, then rejoin the remaining words with single spaces
    result = ' '.join([word for word in tokens if word.isalpha() and word not in stop_words])
    return result


# Apply the preprocessing transformation
df['processed_text'] = df['description'].apply(preprocess_content)
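# Quick sanity check of the preprocessing on a made-up description string
# (illustrative only, not taken from the DolFin dataset).
sample_description = "EFTPOS Purchase at the Coffee Shop 1234"
print(preprocess_content(sample_description))  # e.g. "eftpos purchase coffee shop"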

# Tokenise the processed text
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['processed_text'])
sequences = tokenizer.texts_to_sequences(df['processed_text'])
real_data = pad_sequences(sequences, maxlen=200)
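# Sanity check on the padded training matrix; every row should be a
# fixed-length sequence of word indices, padded with zeros.
print(f'real_data shape: {real_data.shape}')  # expected: (num_samples, 200)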

"""## Model Development
The generator is a LSTM neural network, I tried a few different output layers and had a dense layer to generate the sequence after some research I found that a time distributed layer needed to be used because it is designed to handle sequence generation. The discriminator is just a feed forward LSTM neural network, this is because it has a single LSTM layer and a dense layer, this is the easiest implementation of a discriminator. To improve the discrimintator you could try implementing transfer learning with a model such as BERT. With a better discrimintator it will profide better feedback to the generator to make a better sequence. The GAN model was built by using code developed by Bhavya Kaushik as a general reference https://bhavyakaushik.medium.com/heres-how-you-can-build-your-own-generative-ai-project-a-step-by-step-guide-0e67715c7caa
"""

# Parameters for the generator and discriminator model
latent_dim = 100
vocab_size = 5000
max_sequence_length = 200

# Create a Generator model
def build_generator(latent_dim, vocab_size, sequence_length):
    model = tf.keras.Sequential([
        layers.Input(shape=(latent_dim,)),
        layers.Dense(sequence_length * 128),
        layers.Reshape((sequence_length, 128)),
        layers.LSTM(256, return_sequences=True),
        layers.Dropout(0.5),
        layers.TimeDistributed(layers.Dense(vocab_size, activation='softmax'))
    ])
    return model

generator = build_generator(latent_dim, vocab_size, max_sequence_length)
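# Shape check on a single random latent vector (illustrative only); the generator
# is expected to emit a softmax distribution over the vocabulary at every timestep.
demo_noise = np.random.normal(0, 1, size=(1, latent_dim))
print(generator(demo_noise).shape)  # expected: (1, 200, 5000)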

# Create a Discriminator model
def build_discriminator(sequence_length, vocab_size):
    model = tf.keras.Sequential([
        layers.Input(shape=(sequence_length, vocab_size)),
        layers.LSTM(128),
        layers.Dense(1, activation='sigmoid')
    ])
    return model

discriminator = build_discriminator(max_sequence_length, vocab_size)
optimizer_disc = tf.keras.optimizers.Adam(0.0001)
discriminator.compile(loss='binary_crossentropy', optimizer=optimizer_disc)
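# The model notes above suggest transfer learning for the discriminator. The
# commented-out sketch below is a rough, untested idea (not part of the original
# pipeline) showing how the already-imported TFBertForSequenceClassification
# could classify raw description strings as real or generated. It works on text
# rather than one-hot sequences, so the rest of the training loop would need reworking.
# def build_bert_discriminator():
#     bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#     bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)
#     bert_model.compile(optimizer=tf.keras.optimizers.Adam(2e-5),
#                        loss=tf.keras.losses.BinaryCrossentropy(from_logits=True))
#     return bert_tokenizer, bert_model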

# Create a GAN model
def build_gan(generator, discriminator, latent_dim):
    # Freeze the discriminator inside the combined model so that only the
    # generator's weights are updated when the GAN is trained
    discriminator.trainable = False
    gan_input = layers.Input(shape=(latent_dim,))
    generated_sequence = generator(gan_input)
    gan_output = discriminator(generated_sequence)
    gan = Model(gan_input, gan_output)
    return gan

gan = build_gan(generator, discriminator, latent_dim)
optimizer_gen = tf.keras.optimizers.Adam(0.0002)
gan.compile(loss='binary_crossentropy', optimizer=optimizer_gen)

"""## Training Loop
Code developed by a LLM was used to troubleshoot issues in this training loop OpenAI. (2024). ChatGPT. https://www.openai.com/. When I originally wrote the code I would randomly select real vs generated values and then train the discriminator. The model was able to generate some sequences but now is not working after making some changes.
"""

# Training hyperparameters

epochs = 100
batch_size = 32

for epoch in range(epochs):
    for batch in range(real_data.shape[0] // batch_size):
        # Generate a batch of fake sequences from random noise
        noise = np.random.normal(0, 1, size=(batch_size, latent_dim))
        generated_sequences = generator.predict(noise)

        # Sample a batch of real sequences and one-hot encode them to match
        # the generator's output shape (batch_size, sequence_length, vocab_size)
        real_data_samples = real_data[np.random.randint(0, real_data.shape[0], size=batch_size)]
        real_data_samples = tf.keras.utils.to_categorical(real_data_samples, num_classes=vocab_size)

        labels_real = np.ones((batch_size, 1))
        labels_fake = np.zeros((batch_size, 1))

        # Train the discriminator on the real and fake batches separately
        d_loss_real = discriminator.train_on_batch(real_data_samples, labels_real)
        d_loss_fake = discriminator.train_on_batch(generated_sequences, labels_fake)
        d_loss = 0.5 * (d_loss_real + d_loss_fake)

        # Train the generator (via the combined GAN) to fool the discriminator
        g_loss = gan.train_on_batch(noise, np.ones((batch_size, 1)))

        if batch % 10 == 0:
            print(f"Epoch: {epoch}, Batch: {batch}, D Loss: {d_loss}, G Loss: {g_loss}")

def generate_predicted_sequences(generator, num_samples=10, latent_dim=100):
    # Draw random latent vectors and run them through the trained generator
    noise = np.random.normal(0, 1, (num_samples, latent_dim))
    predicted_sequences = generator.predict(noise)
    return predicted_sequences


def decode_sequences(tokenizer, sequences):
    # Convert each softmax sequence back to words by taking the most likely
    # token at every timestep and looking it up in the tokenizer's vocabulary
    decoded_texts = []
    for sequence in sequences:
        predicted_words = np.argmax(sequence, axis=1)
        decoded_text = ' '.join([tokenizer.index_word.get(i, '') for i in predicted_words])
        decoded_texts.append(decoded_text)
    return decoded_texts

predicted_sequences = generate_predicted_sequences(generator, num_samples=5, latent_dim=100)
decoded_texts = decode_sequences(tokenizer, predicted_sequences)

for i, text in enumerate(decoded_texts, 1):
print(f"Generated Text {i}: {text}")