train_ADA.py

'''
- This program was build for academic purpose - ProteinSeq - Protein Sequence Classifier Copyright (C) 2023  Chethiya Galkaduwa
'''

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from data_filter import data
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier
from tqdm import tqdm


# ------ Train Test Split ------------------------------------------------------------------------

# Split Data
X_train, X_test, y_train, y_test = train_test_split(data['sequence'], data['classification'], test_size = 0.2, random_state = 42)


# After splitting the data, it's important to utilize the CountVectorizer to create a dictionary composed from the training dataset. 
vect = CountVectorizer( analyzer = 'char_wb', ngram_range = (4 , 4) )

# Fit and Transform CountVectorizer
vect.fit(X_train)
X_train_df = vect.transform(X_train)
X_test_df = vect.transform(X_test)

# Print a few of the features
print(vect.get_feature_names_out()[-20:])

# ------ Machine Learning Models | Adaptive boosting ------

# Make a prediction dictionary to store accuracies
prediction = dict()

# set the number of epochs
epochs = 2

# fit the model for each epoch and update the progress bar
for epoch in range(epochs):
    model = AdaBoostClassifier()
    model.fit(X_train_df, y_train)
    ADA_pred = model.predict(X_test_df)
    acc = accuracy_score(ADA_pred, y_test)
    prediction["Adaboost"] = acc
    print (acc)