149 changes: 149 additions & 0 deletions neo_dolfin/ai/sentiment_analysis/BERT_multiclass.py
@@ -0,0 +1,149 @@
# -*- coding: utf-8 -*-
"""Untitled6.ipynb

Automatically generated by Colab.

Original file is located at
https://colab.research.google.com/drive/1mRG9RhnH8GNbDah4xHBCRAVAplv__7nR
"""

import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.optim import AdamW  # transformers.AdamW is deprecated; torch's AdamW works the same way here
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, BertForSequenceClassification

# Load the BERT tokenizer and model
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
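# num_labels=3 matches the three sentiment classes used below: negative (0), neutral (1), positive (2)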

# Prepare the dataset
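# data.csv is expected to provide a 'Sentence' text column and a 'Sentiment'
# column with the string labels 'positive', 'negative', or 'neutral'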
df = pd.read_csv('data.csv')
df['Sentiment'] = df['Sentiment'].map({'positive': 2, 'negative': 0, 'neutral': 1})
train_texts, test_texts, train_labels, test_labels = train_test_split(df['Sentence'], df['Sentiment'], test_size=0.2)

# Tokenize the dataset
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True)
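# Each encoding is a dict of per-sentence 'input_ids', 'token_type_ids', and 'attention_mask' lists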

# Create a PyTorch Dataset
class SentimentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = SentimentDataset(train_encodings, list(train_labels))
test_dataset = SentimentDataset(test_encodings, list(test_labels))

# Create a DataLoader
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True)

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

# Variables to keep track of losses and accuracies for each epoch
train_losses = []
test_accuracies = []

# Training loop
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
for epoch in range(3):
    total_loss = 0
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_train_loss = total_loss / len(train_loader)
    train_losses.append(avg_train_loss)

    # Evaluation for each epoch
    model.eval()
    total_predictions = []
    total_actual = []
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs[0]
        predictions = torch.argmax(logits, dim=-1)
        total_predictions.extend(predictions.cpu().numpy())
        total_actual.extend(labels.cpu().numpy())
    accuracy = np.sum(np.array(total_predictions) == np.array(total_actual)) / len(total_actual)
    test_accuracies.append(accuracy)

    print(f"Epoch: {epoch}, Loss: {avg_train_loss}, Accuracy: {accuracy}")

# Plot training loss over time
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.plot(train_losses, label='Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss Over Time')
plt.legend()

# Plot test accuracy over time
plt.subplot(1, 2, 2)
plt.plot(test_accuracies, label='Test Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Test Accuracy Over Time')
plt.legend()

plt.tight_layout()
plt.show()

# Save the model
torch.save(model.state_dict(), 'best_model_1.pt')
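
# A minimal, illustrative sketch of reloading the saved checkpoint for inference;
# the example sentence below is hypothetical and only shows the intended usage.
inference_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
inference_model.load_state_dict(torch.load('best_model_1.pt', map_location=device))
inference_model.to(device)
inference_model.eval()
example = tokenizer("The company reported strong quarterly growth", return_tensors='pt').to(device)
with torch.no_grad():
    predicted_class = torch.argmax(inference_model(**example).logits, dim=-1).item()
print(f"Predicted class (0=negative, 1=neutral, 2=positive): {predicted_class}")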

# Calculate sentiment counts in training and testing data
train_counts = train_labels.value_counts()
test_counts = test_labels.value_counts()

# Create subplots
fig, axs = plt.subplots(1, 2, figsize=(12, 6))

# Plot training data distribution
axs[0].bar(train_counts.index, train_counts.values)
axs[0].set_xticks([0, 1, 2])
axs[0].set_xticklabels(['negative', 'neutral', 'positive'])
axs[0].set_title('Training Data Distribution')
axs[0].set_xlabel('Sentiment')
axs[0].set_ylabel('Count')

# Plot testing data distribution
axs[1].bar(test_counts.index, test_counts.values)
axs[1].set_xticks([0, 1, 2])
axs[1].set_xticklabels(['negative', 'neutral', 'positive'])
axs[1].set_title('Testing Data Distribution')
axs[1].set_xlabel('Sentiment')
axs[1].set_ylabel('Count')

# Show the plots
plt.tight_layout()
plt.show()

"""# References
https://medium.com/nerd-for-tech/fine-tuning-pretrained-bert-for-sentiment-classification-using-transformers-in-python-931ed142e37
"""