-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocess_humans_data.py
69 lines (55 loc) · 2.07 KB
/
process_humans_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import torch
import string
import pandas as pd
import numpy as np
import seaborn as sns
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_distances
import matplotlib.pyplot as plt
# Download NLTK resources
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
# NLTK resources fetched at import time, downloaded in this fixed order
for _resource in (
    'punkt',
    'stopwords',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
):
    nltk.download(_resource)
del _resource  # keep the loop variable out of the module namespace

# Shared module-level helpers: lemmatizer, punctuation set, English stop words
lemma = WordNetLemmatizer()
exclude = set(string.punctuation)
stop_words = set(stopwords.words('english'))
from elaboration import *
from similarity import *
from flexibility import *
# Compute Elaboration
# with or without stop_words
def calculate_elaboration(sentence, remove_stop_words):
    """Count the tokens in ``sentence``, optionally ignoring stop words.

    Args:
        sentence: Text to split with ``nltk.word_tokenize``.
        remove_stop_words: When truthy, tokens whose lowercase form is in
            the module-level ``stop_words`` set are left out of the count.

    Returns:
        The number of tokens that were kept.
    """
    words = nltk.word_tokenize(sentence)
    if not remove_stop_words:
        return len(words)

    # NOTE(review): punctuation tokens presumably still count here, since the
    # module-level ``exclude`` set is not applied -- confirm this is intended.
    kept = [w for w in words if w.lower() not in stop_words]
    return len(kept)
# Compute all metrics for humans at once
def compute_all_metrics(df, objects, dict_kw_coeff, num_topics,
                        embeddings_model_name="distilbert-base-uncased"):
    """Run the elaboration, dissimilarity and flexibility steps on ``df``.

    The steps run in this order:

    1. ``elaboration_SW``: token count of each ``response`` with stop words
       removed (the count that keeps stop words is not computed here).
    2. ``compute_dissimilarity`` with ``embeddings_model_name``.
    3. ``compute_flexibility_score``.
    4. ``compute_flexibility_augmented_score``.

    Args:
        df: DataFrame with a ``response`` column. The ``elaboration_SW``
            column is assigned on this object, so the caller's frame is
            modified in place.
        objects: Forwarded unchanged to both flexibility helpers.
        dict_kw_coeff: Forwarded unchanged to both flexibility helpers.
        num_topics: Forwarded unchanged to both flexibility helpers.
        embeddings_model_name: Model identifier handed to
            ``compute_dissimilarity``. Defaults to the value that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        Whatever the last helper returns; each helper's result replaces
        ``df`` in turn. Presumably this is the DataFrame with the added
        metric columns -- verify against the helper modules.
    """
    print("Compute elaboration without stop words...")
    df['elaboration_SW'] = df['response'].apply(
        lambda response: calculate_elaboration(response, remove_stop_words=True)
    )

    print("Compute dissimilarity...")
    df = compute_dissimilarity(df, embeddings_model_name)

    print("Compute flexibility...")
    df = compute_flexibility_score(df, dict_kw_coeff, num_topics, objects)

    print("Compute flexibility augmented...")
    df = compute_flexibility_augmented_score(df, dict_kw_coeff, num_topics, objects)
    return df