utils.py

import torch
import re
from text_unidecode import unidecode
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the saved model and tokenizer
Model_path = "jatinmehra/Smollm2-360M-Essay-Scoring" 
model = AutoModelForSequenceClassification.from_pretrained(Model_path)
tokenizer = AutoTokenizer.from_pretrained(Model_path)

# Preprocessing Functions
def resolve_encodings_and_normalize(text: str) -> str:
    """Resolve encoding problems and normalize abnormal characters."""
    text = (
        text.encode("raw_unicode_escape")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
        .encode("cp1252", errors="replace_encoding_with_utf8")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
    )
    text = unidecode(text)  # Convert accented characters to ASCII
    return text

def preprocess_essay_text(text: str) -> str:
    """
    Prepares essay text for scoring by cleaning non-essential issues without altering quality indicators.
    """
    text = resolve_encodings_and_normalize(text)
    text = re.sub(r'\s+', ' ', text.strip())  # Normalize whitespace
    text = re.sub(r'\s+([?.!,"])', r'\1', text)  # Remove spaces before punctuation
    text = re.sub(r',([^\s])', r', \1', text)    # Add space after commas
    return text

# Prediction Function
def predict_score(text: str) -> int:
    # Preprocess the text
    processed_text = preprocess_essay_text(text)

    # Tokenize the input text
    encoding = tokenizer(
        processed_text,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors='pt'
    )

    # Get input IDs and attention mask
    input_ids = encoding['input_ids'].squeeze(0).unsqueeze(0)  # Add batch dimension
    attention_mask = encoding['attention_mask'].squeeze(0).unsqueeze(0)  # Add batch dimension

    # Move tensors to device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)

    # Perform inference
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        prediction = torch.argmax(logits, dim=-1).cpu().numpy()

    # Convert prediction to score (adjust based on your scoring range)
    score = prediction[0] + 1  # Scores range from 1 to 6 | Model predicts from 0 to 5.
    return score