194 changes: 194 additions & 0 deletions neo_dolfin/ai/Transaction_Descrition_GAN/dolfin_transaction_gan.py
@@ -0,0 +1,194 @@
# -*- coding: utf-8 -*-
"""DolFin_Transaction_GAN.ipynb

Automatically generated by Colab.

Original file is located at
https://colab.research.google.com/drive/1VDTyx4ACqsM7hEp_e9Nn8xXGA6q2-oAC

# DolFin GAN for Transaction Data Generation
The purpose of this notebook was to generate additional transaction descriptions for training the NLP models that are replacing the Basiq API. A lot of time went into researching how to properly build a GAN for NLP, and there were very limited examples online. The final outcome was not what I would consider successful, and I would not recommend continuing to work on this code if there is even a remote possibility that the DolFin team will obtain more real transaction data. A small part of the training loop was produced with the help of generative AI; this is indicated where it occurs.

## Import Libraries
"""

# Processing Utilities
import datetime
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Natural Language Processing
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import BertTokenizer, TFBertForSequenceClassification
import nltk
nltk.download('punkt')
nltk.download('stopwords')

# Deep learning (TensorFlow itself is already imported above)
from tensorflow import keras
from tensorflow.keras import Model, layers

# Plotting utilities
import matplotlib.pyplot as plt
import seaborn as sns

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

"""## Import Data
Google Colab was used for this code. Configure the runtime to use a GPU, place the data file in your target directory, and update dataset_dir to point to it.
"""

# Data file location
dataset_dir = '/content/drive/MyDrive/Colab_Notebooks/Projects/DolFin_Transaction_Classification/'
file_name = 'transaction_ut.csv'

# Load the dataset into a dataframe
df = pd.read_csv(dataset_dir + file_name)

# Shuffle the dataframe
df = df.sample(frac=1).reset_index(drop=True)

# Store the column names
labels = list(df.columns)

# Determine the shape of the data
num_samples = df.shape[0]
num_features = df.shape[1]

msg = f'The file {file_name} contains {num_features} features and {num_samples} samples \n'
msg += f'The column names are: {labels}'
print(msg)

"""## Stop Word & Tokenisation
Stop words are joining words that hold no meaning, we remove these because it is the equivilent of removing bad features. Tokenisation is used to convert a string of words into a list of words, where each entry into the list is an individual word.
"""

# Create a stop words variable
stop_words = set(stopwords.words('english'))

# Preprocess the text in the description field
def preprocess_content(text):
    # Convert the text to lower case and split it into tokens
    tokens = word_tokenize(text.lower())
    # Remove stop words and non-alphabetic tokens, then rejoin the remaining words with single spaces
    result = ' '.join([word for word in tokens if word.isalpha() and word not in stop_words])
    return result


# Apply the preprocessing transformation
df['processed_text'] = df['description'].apply(preprocess_content)
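# Quick sanity check of the preprocessing on a made-up description string
# (illustrative only, not taken from the DolFin dataset).
sample_description = "EFTPOS Purchase at the Coffee Shop 1234"
print(preprocess_content(sample_description))  # e.g. "eftpos purchase coffee shop"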

# Tokenise the processed text
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['processed_text'])
sequences = tokenizer.texts_to_sequences(df['processed_text'])
real_data = pad_sequences(sequences, maxlen=200)
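# Sanity check on the padded training matrix; every row should be a
# fixed-length sequence of word indices, padded with zeros.
print(f'real_data shape: {real_data.shape}')  # expected: (num_samples, 200)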

"""## Model Development
The generator is a LSTM neural network, I tried a few different output layers and had a dense layer to generate the sequence after some research I found that a time distributed layer needed to be used because it is designed to handle sequence generation. The discriminator is just a feed forward LSTM neural network, this is because it has a single LSTM layer and a dense layer, this is the easiest implementation of a discriminator. To improve the discrimintator you could try implementing transfer learning with a model such as BERT. With a better discrimintator it will profide better feedback to the generator to make a better sequence. The GAN model was built by using code developed by Bhavya Kaushik as a general reference https://bhavyakaushik.medium.com/heres-how-you-can-build-your-own-generative-ai-project-a-step-by-step-guide-0e67715c7caa
"""

# Parameters for the generator and discriminator model
latent_dim = 100
vocab_size = 5000
max_sequence_length = 200

# Create a Generator model
def build_generator(latent_dim, vocab_size, sequence_length):
    model = tf.keras.Sequential([
        layers.Input(shape=(latent_dim,)),
        layers.Dense(sequence_length * 128),
        layers.Reshape((sequence_length, 128)),
        layers.LSTM(256, return_sequences=True),
        layers.Dropout(0.5),
        layers.TimeDistributed(layers.Dense(vocab_size, activation='softmax'))
    ])
    return model

generator = build_generator(latent_dim, vocab_size, max_sequence_length)
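# Shape check on a single random latent vector (illustrative only); the generator
# is expected to emit a softmax distribution over the vocabulary at every timestep.
demo_noise = np.random.normal(0, 1, size=(1, latent_dim))
print(generator(demo_noise).shape)  # expected: (1, 200, 5000)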

# Create a Discriminator model
def build_discriminator(sequence_length, vocab_size):
    model = tf.keras.Sequential([
        layers.Input(shape=(sequence_length, vocab_size)),
        layers.LSTM(128),
        layers.Dense(1, activation='sigmoid')
    ])
    return model

discriminator = build_discriminator(max_sequence_length, vocab_size)
optimizer_disc = tf.keras.optimizers.Adam(0.0001)
discriminator.compile(loss='binary_crossentropy', optimizer=optimizer_disc)
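# The model notes above suggest transfer learning for the discriminator. The
# commented-out sketch below is a rough, untested idea (not part of the original
# pipeline) showing how the already-imported TFBertForSequenceClassification
# could classify raw description strings as real or generated. It works on text
# rather than one-hot sequences, so the rest of the training loop would need reworking.
# def build_bert_discriminator():
#     bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#     bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)
#     bert_model.compile(optimizer=tf.keras.optimizers.Adam(2e-5),
#                        loss=tf.keras.losses.BinaryCrossentropy(from_logits=True))
#     return bert_tokenizer, bert_model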

# Create a GAN model
def build_gan(generator, discriminator, latent_dim):
    # Freeze the discriminator inside the combined model so that only the
    # generator's weights are updated when the GAN is trained
    discriminator.trainable = False
    gan_input = layers.Input(shape=(latent_dim,))
    generated_sequence = generator(gan_input)
    gan_output = discriminator(generated_sequence)
    gan = Model(gan_input, gan_output)
    return gan

gan = build_gan(generator, discriminator, latent_dim)
optimizer_gen = tf.keras.optimizers.Adam(0.0002)
gan.compile(loss='binary_crossentropy', optimizer=optimizer_gen)

"""## Training Loop
Code developed by a LLM was used to troubleshoot issues in this training loop OpenAI. (2024). ChatGPT. https://www.openai.com/. When I originally wrote the code I would randomly select real vs generated values and then train the discriminator. The model was able to generate some sequences but now is not working after making some changes.
"""

# Training hyperparameters

epochs = 100
batch_size = 32

for epoch in range(epochs):
    for batch in range(real_data.shape[0] // batch_size):
        # Generate a batch of fake sequences from random noise
        noise = np.random.normal(0, 1, size=(batch_size, latent_dim))
        generated_sequences = generator.predict(noise)

        # Sample a batch of real sequences and one-hot encode them to match
        # the generator's output shape (batch_size, sequence_length, vocab_size)
        real_data_samples = real_data[np.random.randint(0, real_data.shape[0], size=batch_size)]
        real_data_samples = tf.keras.utils.to_categorical(real_data_samples, num_classes=vocab_size)

        labels_real = np.ones((batch_size, 1))
        labels_fake = np.zeros((batch_size, 1))

        # Train the discriminator on the real and fake batches separately
        d_loss_real = discriminator.train_on_batch(real_data_samples, labels_real)
        d_loss_fake = discriminator.train_on_batch(generated_sequences, labels_fake)
        d_loss = 0.5 * (d_loss_real + d_loss_fake)

        # Train the generator (via the combined GAN) to fool the discriminator
        g_loss = gan.train_on_batch(noise, np.ones((batch_size, 1)))

        if batch % 10 == 0:
            print(f"Epoch: {epoch}, Batch: {batch}, D Loss: {d_loss}, G Loss: {g_loss}")

def generate_predicted_sequences(generator, num_samples=10, latent_dim=100):
    # Draw random latent vectors and run them through the trained generator
    noise = np.random.normal(0, 1, (num_samples, latent_dim))
    predicted_sequences = generator.predict(noise)
    return predicted_sequences


def decode_sequences(tokenizer, sequences):
    # Convert each softmax sequence back to words by taking the most likely
    # token at every timestep and looking it up in the tokenizer's vocabulary
    decoded_texts = []
    for sequence in sequences:
        predicted_words = np.argmax(sequence, axis=1)
        decoded_text = ' '.join([tokenizer.index_word.get(i, '') for i in predicted_words])
        decoded_texts.append(decoded_text)
    return decoded_texts

predicted_sequences = generate_predicted_sequences(generator, num_samples=5, latent_dim=100)
decoded_texts = decode_sequences(tokenizer, predicted_sequences)

for i, text in enumerate(decoded_texts, 1):
print(f"Generated Text {i}: {text}")