-
Notifications
You must be signed in to change notification settings - Fork 0
/
taxonomy wordclouds.py
74 lines (52 loc) · 2.67 KB
/
taxonomy wordclouds.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from wordcloud import STOPWORDS
from nltk.corpus import stopwords as nltk_stopwords
from nltk.tokenize import word_tokenize
# Load the fairness-taxonomy table from the CSV file
data = pd.read_csv('Fairness Taxonomies.csv')

# Display the first few rows of the dataframe along with column names.
# A bare expression is only echoed in a notebook/REPL; print() makes the
# output visible when this file is run as a script.
print(data.head())
print(data.columns)

# Rename the columns for better readability
data = data.rename(
    columns={'Types of bias': 'Level 1', 'Unnamed: 3': 'Level 2', 'Unnamed: 4': 'Level 3'}
)

# Display the updated column names and the first few rows to confirm changes
print(data.head())
print(data.columns)

# Consolidate all non-missing definitions into a single string for the word cloud
all_definitions = data['Definition'].dropna().str.cat(sep=' ')

# Extra words to exclude in addition to the wordcloud package's STOPWORDS
custom_stopwords = {'and', 'no', 'yes', 'or', 'bias', 'biases', 'arises'}

# Combined stopword set handed to every WordCloud built below
stopwords = STOPWORDS.union(custom_stopwords)
# Function to generate and display a word cloud with stopwords removed
def generate_wordcloud_and_print_common_words(text, title, stopwords, top_n=30):
    """Render a word cloud for *text* and return its highest-weighted words.

    Despite the name, nothing is printed here: the figure is shown with
    matplotlib and the word list is returned for the caller to use.

    Args:
        text: Raw text to build the cloud from.
        title: Title drawn above the figure.
        stopwords: Words passed to ``WordCloud`` to exclude from the cloud.
        top_n: Maximum number of ``(word, weight)`` pairs to return.
            Defaults to 30, the value that used to be hard-coded. ``None``
            returns every word, because the result is cut with a plain slice.

    Returns:
        A list of ``(word, weight)`` tuples taken from ``WordCloud.words_``,
        sorted by weight in descending order and truncated to ``top_n``.
    """
    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        stopwords=stopwords,
    ).generate(text)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)
    plt.show()

    # Sort the cloud's word weights in descending order and keep the top entries
    word_frequencies = wordcloud.words_
    ranked_words = sorted(word_frequencies.items(), key=lambda item: item[1], reverse=True)
    return ranked_words[:top_n]
# Generate the word cloud and get the 30 most common words
common_words = generate_wordcloud_and_print_common_words(
    all_definitions, "All Levels - Combined Bias Types", stopwords
)
# A bare `common_words` expression is only echoed in a notebook/REPL;
# print it so the list is also visible when run as a script.
print(common_words)
# Function to generate word clouds for each level with respective types of biases and definitions
def generate_level_specific_wordclouds(data, levels, stopwords):
    """Show one word cloud per level column, built from that level's definitions.

    For each column name in *levels*, rows whose cell in that column equals
    ``'-'`` are excluded, rows missing either the level or the definition are
    dropped, and the remaining ``'Definition'`` texts are joined with spaces.
    Plotting is delegated to ``generate_wordcloud_and_print_common_words`` so
    both kinds of cloud share one rendering path.

    A level with no definition text left is skipped with a message instead
    of being passed to ``WordCloud``, so one empty level does not stop the
    remaining levels from being drawn.

    Args:
        data: DataFrame containing the level columns and a ``'Definition'``
            column.
        levels: Iterable of level column names to plot.
        stopwords: Words to exclude from every cloud.

    Returns:
        None. Figures are displayed as a side effect.
    """
    for level in levels:
        # Filter relevant data for the level
        has_level = data[level] != '-'
        level_data = data.loc[has_level, [level, 'Definition']].dropna()

        # Concatenate all definitions in the level
        level_definitions = level_data['Definition'].str.cat(sep=' ')

        # Nothing to plot for this level: skip it and carry on with the rest
        if not level_definitions.strip():
            print(f"Skipping {level}: no definitions to plot")
            continue

        # Generate and display the word cloud for the level (returned list unused)
        generate_wordcloud_and_print_common_words(
            level_definitions, f"Word Cloud for {level}", stopwords
        )
# Draw a separate word cloud for each of the three taxonomy level columns
generate_level_specific_wordclouds(
    data=data,
    levels=['Level 1', 'Level 2', 'Level 3'],
    stopwords=stopwords,
)