Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
26f5cc7
Adding multi class classification and tuning for dense and dropout la…
nsriharshavardhan Apr 14, 2024
5fb9cc6
Trying various LLM
nsriharshavardhan Apr 28, 2024
90c38db
Merge branch 'DataBytes-Organisation:develop' into develop
nsriharshavardhan Apr 30, 2024
6473d05
BERT multiclass sentiment analysis on a new financial reviews dataset.
nsriharshavardhan Apr 30, 2024
40e1ca3
Previous LSTM and BERT models cleaned and documented
nsriharshavardhan Apr 30, 2024
188729a
Previous LSTM and BERT models in jupyter notebook changed into python…
nsriharshavardhan Apr 30, 2024
fb60aab
Delete neo_dolfin/ai/chatbot/sample_llm.ipynb
nsriharshavardhan Apr 30, 2024
4d69955
Merge branch 'DataBytes-Organisation:develop' into develop
nsriharshavardhan May 7, 2024
186ca46
Merge branch 'DataBytes-Organisation:develop' into develop
nsriharshavardhan May 11, 2024
2c21625
Merge branch 'DataBytes-Organisation:develop' into develop
nsriharshavardhan May 11, 2024
813a994
Generated fake feedback form data using mockaroo.
nsriharshavardhan May 11, 2024
48bd90f
Adding feedback form visualizations
nsriharshavardhan May 11, 2024
a75a9d9
Simplified random forest model for the account balance prediction.
nsriharshavardhan May 11, 2024
db2bf07
Standardized feedback form data.
nsriharshavardhan May 11, 2024
85977d5
Adding actual vs prediction comparision.
nsriharshavardhan May 11, 2024
6fc8227
Updating the route to feedback data
nsriharshavardhan May 11, 2024
f91324a
Merge branch 'develop' into nsriharshavardhan-feedback
nsriharshavardhan May 17, 2024
f47d187
Updated model with changes made to imports and evaluation
nsriharshavardhan May 17, 2024
a751b22
Updates to feedback form and dashboard
nsriharshavardhan May 17, 2024
d66fadd
Creating directory for feedback dashboard sankey graph.
nsriharshavardhan May 17, 2024
1965ad4
Adding sankey chart and word cloud visualizations for feedback dashboard
nsriharshavardhan May 17, 2024
b6bdedf
Adding feedback form to the footer
nsriharshavardhan May 17, 2024
c1f428c
Adding feedback dashboard to the navbar.
nsriharshavardhan May 17, 2024
5568371
Merge branch 'DataBytes-Organisation:develop' into nsriharshavardhan-…
nsriharshavardhan May 21, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
251 changes: 251 additions & 0 deletions neo_dolfin/ai/generated_data/feedback_data.csv

Large diffs are not rendered by default.

70 changes: 70 additions & 0 deletions neo_dolfin/ai/savings/Simplified_RF.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import joblib
import matplotlib.pyplot as plt

# Load a transactions CSV and derive a cumulative-balance time series.
def read_process_transactions(file_name):
    """Read ``<file_name>.csv`` and return a date-sorted frame with a
    running total of the 'balance' column.

    Expects columns 'Date' (parseable dates) and 'balance'.
    Returns a DataFrame with columns ['postDate', 'total_balance'].

    NOTE(review): assumes 'balance' holds per-transaction deltas, so the
    cumulative sum yields an account total — confirm against the data source.
    """
    frame = pd.read_csv(file_name + '.csv')
    frame['postDate'] = pd.to_datetime(frame['Date'])
    frame = frame.sort_values(by='postDate')
    # Running total in chronological order (cumsum follows the sorted rows).
    frame['total_balance'] = frame['balance'].cumsum()
    return frame[['postDate', 'total_balance']]

# Derive calendar-based model features from the 'postDate' column.
def create_features(data):
    """Attach calendar feature columns to *data* (in place) and return it.

    NOTE(review): 'week' is populated from dayofweek, making it a duplicate
    of the 'dayofweek' column — possibly the ISO week number was intended;
    confirm before renaming (the FEATURES list depends on 'week').
    """
    dates = data['postDate'].dt
    for column, values in (
        ('week', dates.dayofweek),
        ('quarter', dates.quarter),
        ('month', dates.month),
        ('year', dates.year),
        ('dayofyear', dates.dayofyear),
        ('dayofmonth', dates.day),
        ('dayofweek', dates.dayofweek),
    ):
        data[column] = values
    return data

# --- Script: load data, train a Random Forest, and plot predictions ----
# Read and process the data
# NOTE(review): path is relative to the working directory, not this file —
# running the script from another directory will fail to find the CSV.
data = read_process_transactions('../ai/generated_data/fake_bank_account_transactions2')
data = create_features(data)
data.set_index('postDate', inplace=True)

# Prepare the data for the model
FEATURES = ['week', 'quarter', 'month', 'year', 'dayofyear', 'dayofmonth', 'dayofweek']
TARGET = 'total_balance'
X = data[FEATURES]
y = data[TARGET]

# Split the data
# NOTE(review): train_test_split shuffles by default, so future dates leak
# into the training set — for time-series forecasting a chronological
# split would be more realistic. Confirm whether this is intentional.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Save the model to a file (written to the current working directory)
joblib.dump(model, 'random_forest_model.joblib')

# Predictions on the held-out rows, used for the comparison plot below.
y_pred = model.predict(X_test)

# Plotting the actual vs predicted values
# NOTE(review): X_test/y_test retain the shuffled order from the split, so
# these date-indexed line plots will zigzag unless the rows are sorted
# by date first.
plt.figure(figsize=(10, 5))
plt.plot(y_test.index, y_test, label='Actual')
plt.plot(y_test.index, y_pred, label='Predicted', linestyle='--')
plt.title('Test Data vs Predicted Data')
plt.xlabel('Date')
plt.ylabel('Total Balance')
plt.legend()
plt.show()

# Plotting function with the new color scheme
def plot_feature_importances(model, features):
    """Plot a horizontal bar chart of the model's feature importances,
    sorted ascending so the most important feature appears at the top.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_`` (ndarray)
    features : list[str]
        Feature names, in the same order as the training columns.
    """
    importances = model.feature_importances_
    # Fix: the original called np.argsort(), but numpy was never imported
    # in this module (NameError at runtime). The ndarray's own .argsort()
    # method is equivalent and needs no import.
    indices = importances.argsort()
    plt.figure(figsize=(10, 6))
    # Color list cycles if there are more features than colors.
    plt.barh(range(len(indices)), importances[indices],
             color=['#343a40', '#0077bb', '#d9d9d9', '#fafafe', '#abb5be'])
    plt.yticks(range(len(indices)), [features[i] for i in indices])
    plt.xlabel('Feature Importance')
    plt.show()

# Plot the feature importances
plot_feature_importances(model, FEATURES)
150 changes: 150 additions & 0 deletions neo_dolfin/ai/sentiment_analysis/BERT_multiclass.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
# -*- coding: utf-8 -*-
"""Untitled6.ipynb

Automatically generated by Colab.

Original file is located at
https://colab.research.google.com/drive/1mRG9RhnH8GNbDah4xHBCRAVAplv__7nR
"""

import torch
import numpy as np
from transformers import BertTokenizerFast, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW
import matplotlib.pyplot as plt
import pandas as pd

# Load the BERT tokenizer and model (3-way sentiment head).
# Downloads pretrained weights on first run; requires network access.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Prepare the dataset: map sentiment strings to integer class ids
# (negative=0, neutral=1, positive=2).
# NOTE(review): any Sentiment value outside these three strings becomes NaN
# after .map() and will break label tensor creation later — confirm the CSV
# is clean.
# NOTE(review): no random_state is passed, so the train/test split differs
# on every run (results are not reproducible).
df = pd.read_csv('data.csv')
df['Sentiment'] = df['Sentiment'].map({'positive': 2, 'negative': 0, 'neutral': 1})
train_texts, test_texts, train_labels, test_labels = train_test_split(df['Sentence'], df['Sentiment'], test_size=0.2)

# Tokenize the dataset; padding=True pads every sequence to the longest
# one in each call, so train and test may have different padded lengths.
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True)

# Wrap tokenizer output and labels as a map-style PyTorch dataset.
class SentimentDataset(Dataset):
    """Map-style dataset pairing BERT encodings with integer class labels.

    ``encodings`` is the mapping returned by the tokenizer (input_ids,
    attention_mask, ...); ``labels`` is an indexable sequence of ints.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # One tensor per encoding field, plus the label under the key
        # the HF model expects for loss computation.
        sample = {name: torch.tensor(values[idx])
                  for name, values in self.encodings.items()}
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

    def __len__(self):
        return len(self.labels)

# Build training / evaluation datasets and loaders.
train_dataset = SentimentDataset(train_encodings, list(train_labels))
test_dataset = SentimentDataset(test_encodings, list(test_labels))

# Create a DataLoader
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
# Fix: there is no reason to shuffle the evaluation set — the accuracy is
# aggregated over all batches either way, and a fixed order makes
# per-batch debugging deterministic.
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Set up the optimizer
# Fix: transformers.AdamW is deprecated (removed in newer transformers
# releases); its deprecation warning recommends torch.optim.AdamW.
# NOTE(review): torch's AdamW defaults to weight_decay=0.01 vs 0.0 in the
# transformers version — confirm the small training-dynamics change is OK.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

# Per-epoch history, used for the plots further down.
train_losses = []
test_accuracies = []

# Training loop: fine-tune for 3 epochs, evaluating test accuracy after
# each epoch.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
for epoch in range(3):
    total_loss = 0
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # Passing labels= makes the HF model compute cross-entropy loss
        # internally and expose it as outputs.loss.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_train_loss = total_loss / len(train_loader)
    train_losses.append(avg_train_loss)

    # Evaluation for each epoch (no gradients, model in eval mode).
    model.eval()
    total_predictions = []
    total_actual = []
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Predicted class = index of the highest logit for each sample.
        predictions = torch.argmax(logits, dim=-1)
        total_predictions.extend(predictions.cpu().numpy())
        total_actual.extend(labels.cpu().numpy())
    # Overall accuracy across all test batches for this epoch.
    accuracy = np.sum(np.array(total_predictions) == np.array(total_actual)) / len(total_actual)
    test_accuracies.append(accuracy)

    print(f"Epoch: {epoch}, Loss: {avg_train_loss}, Accuracy: {accuracy}")

# Plot training loss over time (left panel).
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.plot(train_losses, label='Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss Over Time')
plt.legend()

# Plot test accuracy over time (right panel).
plt.subplot(1, 2, 2)
plt.plot(test_accuracies, label='Test Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Test Accuracy Over Time')
plt.legend()

plt.tight_layout()
plt.show()

# Save the model weights
# NOTE(review): despite the file name, this saves the FINAL epoch's
# weights, not the best-accuracy epoch — confirm intent (no checkpointing
# happens inside the training loop).
torch.save(model.state_dict(), 'best_model.pt')

# Calculate sentiment class counts in training and testing data.
# value_counts() orders by frequency, but bar() positions each bar at its
# integer label value (the index), so the 0/1/2 xtick labels line up
# correctly regardless of count order.
train_counts = train_labels.value_counts()
test_counts = test_labels.value_counts()

# Create subplots: class distribution for train (left) and test (right).
fig, axs = plt.subplots(1, 2, figsize=(12, 6))

# Plot training data distribution
axs[0].bar(train_counts.index, train_counts.values)
axs[0].set_xticks([0, 1, 2])
axs[0].set_xticklabels(['negative', 'neutral', 'positive'])
axs[0].set_title('Training Data Distribution')
axs[0].set_xlabel('Sentiment')
axs[0].set_ylabel('Count')

# Plot testing data distribution
axs[1].bar(test_counts.index, test_counts.values)
axs[1].set_xticks([0, 1, 2])
axs[1].set_xticklabels(['negative', 'neutral', 'positive'])
axs[1].set_title('Testing Data Distribution')
axs[1].set_xlabel('Sentiment')
axs[1].set_ylabel('Count')

# Show the plots
plt.tight_layout()
plt.show()

"""# References
https://medium.com/nerd-for-tech/fine-tuning-pretrained-bert-for-sentiment-classification-using-transformers-in-python-931ed142e37
"""
Loading