-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
66 lines (57 loc) · 2.42 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import torch
import re
from text_unidecode import unidecode
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Load the saved model and tokenizer.
# This runs at import time: both objects are created once as module-level
# globals and are reused by predict_score() on every call.
# Model_path is the identifier handed to both from_pretrained() calls, so the
# tokenizer and the classification model come from the same checkpoint.
Model_path = "jatinmehra/Smollm2-360M-Essay-Scoring"
model = AutoModelForSequenceClassification.from_pretrained(Model_path)
tokenizer = AutoTokenizer.from_pretrained(Model_path)
# Preprocessing Functions
def _replace_encoding_with_utf8(error: UnicodeError) -> "tuple[bytes, int]":
    """Codec error handler: emit the unencodable slice as UTF-8 bytes."""
    return error.object[error.start:error.end].encode("utf-8"), error.end


def _replace_decoding_with_cp1252(error: UnicodeError) -> "tuple[str, int]":
    """Codec error handler: decode the undecodable slice as cp1252."""
    return error.object[error.start:error.end].decode("cp1252"), error.end


def _register_codec_error_handlers() -> None:
    """Register the custom error handlers used by the encode/decode chain.

    A handler name that is already registered is left untouched so that an
    existing registration made elsewhere in the process is not overwritten.
    """
    import codecs  # local import: the file's import block does not have it

    handlers = (
        ("replace_encoding_with_utf8", _replace_encoding_with_utf8),
        ("replace_decoding_with_cp1252", _replace_decoding_with_cp1252),
    )
    for name, handler in handlers:
        try:
            codecs.lookup_error(name)
        except LookupError:
            codecs.register_error(name, handler)


# Without this registration, the first codec error inside
# resolve_encodings_and_normalize() raises
# "LookupError: unknown error handler name".
_register_codec_error_handlers()


def resolve_encodings_and_normalize(text: str) -> str:
    """Resolve encoding problems and normalize abnormal characters.

    The text is round-tripped through raw_unicode_escape -> UTF-8 -> cp1252
    -> UTF-8. Bytes that fail to decode as UTF-8 are decoded as cp1252, and
    characters that cannot be encoded as cp1252 are emitted as UTF-8 bytes.
    The result is then passed through ``unidecode``.

    Args:
        text: Input text, possibly containing mis-encoded characters.

    Returns:
        The text after the encoding round-trip and ``unidecode`` conversion.
    """
    text = (
        text.encode("raw_unicode_escape")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
        .encode("cp1252", errors="replace_encoding_with_utf8")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
    )
    text = unidecode(text)  # Convert accented characters to ASCII
    return text
def preprocess_essay_text(text: str) -> str:
    """Clean an essay before scoring.

    Runs the text through ``resolve_encodings_and_normalize``, trims it, and
    then applies a fixed sequence of regex rewrites that tidy whitespace and
    punctuation spacing. The wording of the essay is not otherwise changed.
    """
    cleaned = resolve_encodings_and_normalize(text).strip()

    # Applied in order; each rule is (pattern, replacement).
    rewrite_rules = (
        (r'\s+', ' '),               # collapse every whitespace run to one space
        (r'\s+([?.!,"])', r'\1'),    # drop whitespace in front of punctuation
        (r',([^\s])', r', \1'),      # make sure a comma is followed by a space
    )
    for pattern, replacement in rewrite_rules:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned
# Prediction Function
def predict_score(text: str) -> int:
    """Predict an essay score using the module-level model and tokenizer.

    The essay is cleaned with ``preprocess_essay_text``, tokenized to a fixed
    length of 512 tokens (padded or truncated), and run through ``model``
    without gradient tracking. The index of the largest logit is shifted by
    one to give the score.

    Args:
        text: Raw essay text.

    Returns:
        The predicted score as a built-in ``int`` (argmax class index + 1).
    """
    # Preprocess the text
    processed_text = preprocess_essay_text(text)

    # Tokenize the input text
    encoding = tokenizer(
        processed_text,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors='pt'
    )

    # Move model and inputs to the same device. With return_tensors='pt' the
    # tokenizer output for a single string is expected to already carry a
    # batch dimension, so the tensors are used as returned.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Perform inference
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    predicted_class = torch.argmax(outputs.logits, dim=-1)[0].item()

    # Scores range from 1 to 6 | Model predicts from 0 to 5.
    # .item() yields a plain Python int, matching the annotated return type
    # (the previous numpy.int64 result was not JSON-serializable).
    return int(predicted_class) + 1