diff --git a/neo_dolfin/ai/Transaction_Descrition_GAN/dolfin_transaction_gan.py b/neo_dolfin/ai/Transaction_Descrition_GAN/dolfin_transaction_gan.py
new file mode 100644
index 00000000..0588c293
--- /dev/null
+++ b/neo_dolfin/ai/Transaction_Descrition_GAN/dolfin_transaction_gan.py
@@ -0,0 +1,194 @@
+# -*- coding: utf-8 -*-
+"""DolFin_Transaction_GAN.ipynb
+
+Automatically generated by Colab.
+
+Original file is located at
+    https://colab.research.google.com/drive/1VDTyx4ACqsM7hEp_e9Nn8xXGA6q2-oAC
+
+# DolFin GAN for Transaction Data Generation
+The purpose of this notebook was to generate additional transaction descriptions to train the NLP models on, to further improve the models that are replacing the BASIq API. A lot of time went into researching how to properly build a GAN for NLP, and there were very limited examples online. The final outcome is not what I would deem successful, and I would not recommend continuing to contribute to this code if there is any possibility that the DolFin team will be getting more transaction data. Generative AI was used to help with the training loop for the GAN; this is indicated in the relevant section.
+
+## Import Libraries
+"""
+
+# Processing Utilities
+import datetime
+import os
+import numpy as np
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from tensorflow.keras.utils import to_categorical
+from sklearn.preprocessing import LabelEncoder
+
+# Natural Language Processing
+import tensorflow as tf
+from tensorflow.keras.models import Model
+from tensorflow.keras.preprocessing.text import Tokenizer
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from transformers import BertTokenizer, TFBertForSequenceClassification
+import nltk
+nltk.download('punkt')
+nltk.download('stopwords')
+
+# Deep learning
+from tensorflow import keras
+from tensorflow.keras import layers
+
+# Plotting utilities
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+# Mount Google Drive
+from google.colab import drive
+drive.mount('/content/drive')
+
+"""## Import Data
+Google Colab was used for this code. You will need a GPU configured in the runtime, the data file will need to be in your target directory, and you will need to change dataset_dir accordingly.
+"""
+
+# Data location and file name
+dataset_dir = '/content/drive/MyDrive/Colab_Notebooks/Projects/DolFin_Transaction_Classification/'
+file_name = 'transaction_ut.csv'
+
+# Load the dataset into a dataframe
+df = pd.read_csv(dataset_dir + file_name)
+
+# Shuffle the dataframe
+df = df.sample(frac=1).reset_index(drop=True)
+
+# Store the column names
+labels = list(df.columns)
+
+# Determine the shape of the data
+num_samples = df.shape[0]
+num_features = df.shape[1]
+
+msg = f'The file {file_name} contains {num_features} features and {num_samples} samples \n'
+msg += f'The column names are: {labels}'
+print(msg)
+
+"""## Stop Word Removal & Tokenisation
+Stop words are joining words that carry little meaning on their own; removing them is the equivalent of removing uninformative features. Tokenisation converts a string of words into a list of words, where each entry in the list is an individual word.
+"""
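+
+# A quick illustration of the preprocessing below (the description string is made up,
+# not taken from the DolFin data):
+#   word_tokenize("payment to woolworths 1234")  ->  ['payment', 'to', 'woolworths', '1234']
+# Dropping the stop word 'to' and the non-alphabetic token '1234' leaves:
+#   'payment woolworths'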
+""" + +# Create a stop words variable +stop_words = set(stopwords.words('english')) + +# Preprocess the text in the content field +def preprocess_content(text): + # Covert the text to lower case + tokens = word_tokenize(text.lower()) + # Remove stop words from the text and ensure each token is seproate by a space + result = ' '.join([word for word in tokens if word.isalpha() and word not in stop_words]) + return result + + +# Apply the preprocessing transformation +df['processed_text'] = df['description'].apply(preprocess_content) + +# Tokenisation of the processed_text +tokenizer = Tokenizer(num_words=5000) +tokenizer.fit_on_texts(df['description']) +sequences = tokenizer.texts_to_sequences(df['description']) +real_data = pad_sequences(sequences, maxlen=200) + +"""## Model Development +The generator is a LSTM neural network, I tried a few different output layers and had a dense layer to generate the sequence after some research I found that a time distributed layer needed to be used because it is designed to handle sequence generation. The discriminator is just a feed forward LSTM neural network, this is because it has a single LSTM layer and a dense layer, this is the easiest implementation of a discriminator. To improve the discrimintator you could try implementing transfer learning with a model such as BERT. With a better discrimintator it will profide better feedback to the generator to make a better sequence. The GAN model was built by using code developed by Bhavya Kaushik as a general reference https://bhavyakaushik.medium.com/heres-how-you-can-build-your-own-generative-ai-project-a-step-by-step-guide-0e67715c7caa +""" + +# Parameters for the generator and discriminator model +latent_dim = 100 +vocab_size = 5000 +max_sequence_length = 200 + +# Create a Generator model +def build_generator(latent_dim, vocab_size, sequence_length): + model = tf.keras.Sequential([ + layers.Input(shape=(latent_dim,)), + layers.Dense(sequence_length * 128), + layers.Reshape((sequence_length, 128)), + layers.LSTM(256, return_sequences=True), + layers.Dropout(0.5), + layers.TimeDistributed(layers.Dense(vocab_size, activation='softmax')) + ]) + return model + +generator = build_generator(latent_dim, vocab_size, max_sequence_length) + +# Create a Discriminator model +def build_discriminator(sequence_length, vocab_size): + model = tf.keras.Sequential([ + layers.Input(shape=(sequence_length, vocab_size)), + layers.LSTM(128), + layers.Dense(1, activation='sigmoid') + ]) + return model + +discriminator = build_discriminator(max_sequence_length, vocab_size) +discriminator.compile(loss='binary_crossentropy', optimizer=optimizer_disc) +optimizer_disc = tf.keras.optimizers.Adam(0.0001) + +# Create a GAN model +def build_gan(generator, discriminator, latent_dim): + discriminator.trainable = False + gan_input = layers.Input(shape=(latent_dim,)) + generated_sequence = generator(gan_input) + gan_output = discriminator(generated_sequence) + gan = Model(gan_input, gan_output) + return gan + +gan = build_gan(generator, discriminator, latent_dim) +gan.compile(loss='binary_crossentropy', optimizer=optimizer_gen) + +"""## Training Loop +Code developed by a LLM was used to troubleshoot issues in this training loop OpenAI. (2024). ChatGPT. https://www.openai.com/. When I originally wrote the code I would randomly select real vs generated values and then train the discriminator. The model was able to generate some sequences but now is not working after making some changes. 
+""" + +# Compile models with separate optimizers +optimizer_gen = tf.keras.optimizers.Adam(0.0002) + +epochs = 100 +batch_size = 32 + +for epoch in range(epochs): + for batch in range(real_data.shape[0] // batch_size): + noise = np.random.normal(0, 1, size=(batch_size, latent_dim)) + generated_sequences = generator.predict(noise) + + real_data_samples = real_data[np.random.randint(0, real_data.shape[0], size=batch_size)] + real_data_samples = tf.keras.utils.to_categorical(real_data_samples, num_classes=vocab_size) + + labels_real = np.ones((batch_size, 1)) + labels_fake = np.zeros((batch_size, 1)) + + d_loss_real = discriminator.train_on_batch(real_data_samples, labels_real) + d_loss_fake = discriminator.train_on_batch(generated_sequences, labels_fake) + + d_loss = 0.5 * (d_loss_real + d_loss_fake) + g_loss = gan.train_on_batch(noise, np.ones((batch_size, 1))) + + if batch % 10 == 0: + print(f"Epoch: {epoch}, Batch: {batch}, D Loss: {d_loss}, G Loss: {g_loss}") + +def generate_predicted_sequences(generator, num_samples=10, latent_dim=100): + noise = np.random.normal(0, 1, (num_samples, latent_dim)) + predicted_sequences = generator.predict(noise) + return predicted_sequences + +def decode_sequences(tokenizer, sequences): + decoded_texts = [] + for sequence in sequences: + predicted_words = np.argmax(sequence, axis=1) + decoded_text = ' '.join([tokenizer.index_word[i] if i in tokenizer.index_word else '' for i in predicted_words]) + decoded_texts.append(decoded_text) + return decoded_texts + +predicted_sequences = generate_predicted_sequences(generator, num_samples=5, latent_dim=100) +decoded_texts = decode_sequences(tokenizer, predicted_sequences) + +for i, text in enumerate(decoded_texts, 1): + print(f"Generated Text {i}: {text}") \ No newline at end of file