"""Unit tests for the AI-vs-human text classifier pipeline.

Covers text preprocessing, TF-IDF vectorization, logistic-regression
training, and in-sample accuracy on a small labelled toy corpus.
"""

import pytest
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


# Mock data for testing
@pytest.fixture
def sample_data():
    """Small labelled corpus; 'generated' is 0 for human text, 1 for AI text."""
    return pd.DataFrame({
        'text': [
            "This is a test sentence.",
            "Another example text from an AI model.",
            "Humans write text like this.",
            "AI can generate sentences too.",
            "What makes AI different is its patterns."
        ],
        'generated': [0, 1, 0, 1, 1]  # 0 = Human, 1 = AI
    })


def _fit_tfidf_logreg(sample_data):
    """Vectorize the corpus and fit a logistic-regression model.

    Shared helper so individual tests don't duplicate the
    vectorize/train boilerplate.

    Returns:
        (X, y, model): the TF-IDF matrix, the label series, and the
        fitted LogisticRegression instance.
    """
    vectorizer = TfidfVectorizer(max_features=500)
    X = vectorizer.fit_transform(sample_data['text'])
    y = sample_data['generated']
    model = LogisticRegression(max_iter=1000, solver='liblinear')
    model.fit(X, y)
    return X, y, model


# Test for data preprocessing
def test_preprocessing(sample_data):
    """Lower-casing and punctuation stripping produce the expected text."""
    def clean_text(text):
        return text.lower().replace(".", "").replace(",", "")

    sample_data['cleaned_text'] = sample_data['text'].apply(clean_text)

    assert 'cleaned_text' in sample_data.columns
    assert sample_data['cleaned_text'][0] == "this is a test sentence"


# Test for TF-IDF vectorization
def test_tfidf_vectorization(sample_data):
    """TF-IDF output has one row per document and honours max_features."""
    vectorizer = TfidfVectorizer(max_features=500)
    X = vectorizer.fit_transform(sample_data['text'])

    assert X.shape[0] == len(sample_data)  # one row per document
    assert X.shape[1] <= 500               # feature cap respected


# Test for Logistic Regression training
def test_logistic_regression_training(sample_data):
    """A fitted model carries one coefficient per TF-IDF feature."""
    X, _, model = _fit_tfidf_logreg(sample_data)

    # Binary liblinear model: coef_ is a single row of per-feature weights.
    assert model.coef_.shape == (1, X.shape[1])


# Test for accuracy calculation
def test_model_accuracy(sample_data):
    """In-sample accuracy beats chance on the toy corpus."""
    X, y, model = _fit_tfidf_logreg(sample_data)

    # Predict on the training data itself; this only sanity-checks that
    # the model learned *something*, not that it generalizes.
    y_pred = model.predict(X)
    accuracy = accuracy_score(y, y_pred)
    assert accuracy > 0.5  # Ensure accuracy is reasonable