-
Notifications
You must be signed in to change notification settings - Fork 0
/
oruga3_gde3_wmd.py
179 lines (143 loc) · 5.39 KB
/
oruga3_gde3_wmd.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
# -*- coding: utf-8 -*-
"""
ORUGA: Optimizing Readability Using Genetic Algorithms
[Martinez-Gil2023a] J. Martinez-Gil, "Optimizing Readability Using Genetic Algorithms", arXiv preprint arXiv:2301.00374, 2023
@author: Jorge Martinez-Gil
"""
from jmetal.algorithm.multiobjective.gde3 import GDE3
from jmetal.util.termination_criterion import StoppingByEvaluations
from readability import Readability
from nltk.corpus import wordnet
from jmetal.core.problem import FloatProblem
from jmetal.core.solution import FloatSolution
import gensim.downloader as api
# Pretrained word2vec vectors obtained through gensim's downloader. This runs
# at module import time; Oruga.evaluate uses the model for wmdistance.
model = api.load('word2vec-google-news-300')
def listToString(s):
    """Join the items of ``s`` into one space-separated string.

    Every item is converted with ``str`` and followed by a single space, so a
    non-empty result always ends with a trailing space. Two clean-ups are then
    applied to the joined text: a space directly before a comma is removed,
    and underscores are replaced with spaces.

    Args:
        s: Iterable of items to join.

    Returns:
        The joined and cleaned-up string; ``""`` when ``s`` is empty.
    """
    # A single join keeps the build linear instead of growing a string
    # with repeated ``+=``.
    joined = "".join(str(ele) + " " for ele in s)
    return joined.replace(' ,', ',').replace('_', ' ')
def Synonym(word, number):
    """Pick a WordNet synonym of ``word`` by position.

    The lemma names of every synset returned by ``wordnet.synsets(word)`` are
    collected in iteration order (duplicates are kept) and ``number`` selects
    one of them.

    Args:
        word: The word to look up.
        number: Requested position. May be a float; it is truncated with
            ``int`` where an index is needed.

    Returns:
        A ``(index, synonym)`` tuple:

        * ``(-2, word)`` when no lemma is found for ``word``.
        * ``(len - 1, last lemma)`` when ``number`` is at or past the end of
          the lemma list.
        * ``(int(number), lemma)`` otherwise, where the lemma is taken at
          position ``int(number - 1)``, i.e. ``number`` is treated as 1-based.
    """
    synonyms = [
        lm.name()
        for syn in wordnet.synsets(word)
        for lm in syn.lemmas()
    ]

    if not synonyms:
        return -2, word
    if number >= len(synonyms):
        return len(synonyms) - 1, synonyms[-1]

    # Clamp the position so a value below 1 selects the first lemma instead
    # of wrapping around to the end of the list or raising IndexError.
    position = max(int(number - 1), 0)
    return int(number), synonyms[position]
def fitness_func1(solution):
    """Return the ARI score of the text that ``solution`` encodes.

    ``solution`` is indexed in step with the whitespace-separated words of the
    module-level ``text``. The sequence is modified in place: every position
    whose ``index_array`` entry is ``<= 0`` is reset to ``0`` before the text
    is rebuilt.

    Args:
        solution: Mutable sequence of numbers, one per word of ``text``.

    Returns:
        ``Readability(rebuilt_text).ari().score``.
    """
    # Preprocessing: switch off the positions marked as non-replaceable.
    for pos, marker in enumerate(index_array):
        if marker <= 0:
            solution[pos] = 0

    rebuilt = []
    for pos, token in enumerate(text.split()):
        gene = solution[pos]
        if gene < 1:
            # Below the threshold: keep the original word.
            rebuilt.append(token)
        elif gene >= 1:
            # The token comes straight from text.split(), so any attached
            # punctuation is passed to Synonym as well.
            _, replacement = Synonym(token, gene)
            rebuilt.append(replacement)
        else:
            # Neither comparison held (e.g. a NaN value).
            print ('Error')

    return Readability(listToString(rebuilt)).ari().score
# Passage processed by the rest of the script.
text = 'The sea moderates the climate and has important roles in the water cycle, carbon cycle, and nitrogen cycle. Humans harnessing and studying the sea have been recorded since ancient times, and evidenced well into prehistory, while its modern scientific study is called oceanography. The most abundant solid dissolved in seawater is sodium chloride. The water also contains salts of magnesium, calcium, potassium, and mercury, amongst many other elements, some in minute concentrations. Salinity varies widely, being lower near the surface and the mouths of large rivers and higher in the depths of the ocean; however, the relative proportions of dissolved salts vary little across the oceans.'

# text_array  - one entry per word of ``text``: either the word itself or the
#               synonym returned by Synonym(word, 6), with its comma/period
#               re-attached.
# index_array - one entry per word: the index returned by Synonym, or 0 for
#               words that are skipped (capitalised or at most 3 characters).
text_array = []
index_array = []
res = text.split()
for token in res:
    # Decide which mark to put back afterwards; a period wins over a comma.
    if '.' in token:
        suffix = '.'
    elif ',' in token:
        suffix = ','
    else:
        suffix = ''
    bare = token.replace(',', '').replace('.', '')

    # Only lower-case-initial words longer than three characters are looked up.
    if not bare[0].isupper() and len(bare) > 3:
        number, word = Synonym(bare, 6)
    else:
        number, word = 0, bare

    text_array.append(word + suffix)
    index_array.append(number)
def obtain_text(solution):
    """Rebuild ``text`` according to ``solution``, upper-casing replacements.

    ``solution`` is indexed in step with the whitespace-separated words of the
    module-level ``text``. A value below 1 keeps the word as is; a value of 1
    or more swaps in the synonym returned by ``Synonym`` and writes it in
    upper case.

    Args:
        solution: Sequence of numbers, one per word of ``text``.

    Returns:
        The rebuilt text as produced by ``listToString``.
    """
    pieces = []
    for pos, token in enumerate(text.split()):
        gene = solution[pos]
        if gene < 1:
            pieces.append(token)
        elif gene >= 1:
            _, replacement = Synonym(token, gene)
            pieces.append(replacement.upper())
        else:
            # Neither comparison held (e.g. a NaN value).
            print ('Error')
    return listToString(pieces)
class Oruga(FloatProblem):
    """Three-objective problem over the words of the module-level ``text``.

    There is one real-valued variable per entry of ``index_array``, bounded
    to ``[-4, 4]``. The objectives written by ``evaluate`` are:

    * ``objectives[0]`` - number of variables that are ``>= 1``.
    * ``objectives[1]`` - value returned by ``fitness_func1``.
    * ``objectives[2]`` - ``model.wmdistance`` between ``text`` and the text
      rebuilt by ``obtain_text``.
    """

    def __init__(self):
        super(Oruga, self).__init__()
        self.number_of_objectives = 3
        self.number_of_variables = len(index_array)
        self.number_of_constraints = 0

        # One direction and one label per declared objective.
        self.obj_directions = [self.MINIMIZE, self.MINIMIZE, self.MINIMIZE]
        self.obj_labels = ['f(x)', 'f(y)', 'f(z)']

        self.lower_bound = self.number_of_variables * [-4]
        self.upper_bound = self.number_of_variables * [4]

        # The bounds are also assigned on the FloatSolution class itself.
        FloatSolution.lower_bound = self.lower_bound
        FloatSolution.upper_bound = self.upper_bound

    @staticmethod
    def _tokens(document):
        """Lower-case ``document`` and split it into punctuation-free words."""
        stripped = (raw.strip('.,;:') for raw in document.lower().split())
        return [word for word in stripped if word]

    def evaluate(self, solution: FloatSolution) -> FloatSolution:
        """Fill in the three objectives of ``solution`` and return it."""
        # fitness_func1 zeroes the non-replaceable variables in place, so it
        # runs first; the other two objectives then describe the same text.
        solution.objectives[1] = fitness_func1(solution.variables)

        # wmdistance takes each document as a list of tokens; a plain string
        # would be iterated character by character.
        source_tokens = self._tokens(text)
        target_tokens = self._tokens(obtain_text(solution.variables))
        solution.objectives[2] = float(
            model.wmdistance(source_tokens, target_tokens)
        )

        solution.objectives[0] = len(
            [1 for i in solution.variables if i >= 1]
        )
        return solution

    def get_name(self):
        """Return the display name of the problem."""
        return 'Oruga'
# Run GDE3 on the Oruga problem until the evaluation budget is spent.
max_evaluations = 3000
problem = Oruga()
termination = StoppingByEvaluations(max_evaluations)
algorithm = GDE3(
    problem=problem,
    population_size=100,
    cr=0.5,
    f=0.5,
    termination_criterion=termination,
)
algorithm.run()

from jmetal.util.solution import (
    get_non_dominated_solutions,
    print_function_values_to_file,
    print_variables_to_file,
)
from jmetal.lab.visualization import Plot

# Keep only the non-dominated solutions of the final result.
front = get_non_dominated_solutions(algorithm.get_result())

# save to files
print_function_values_to_file(front, 'FUN.GDE3')
print_variables_to_file(front, 'VAR.GDE3')

# NOTE(review): two axis labels are given although Oruga declares three
# objectives - confirm how Plot labels a three-objective front.
plot_front = Plot(
    title='ORUGA',
    axis_labels=['Words to be replaced', 'Readability Score'],
)
plot_front.plot(front, label='GDE3', filename='GDE3-ORUGA', format='png')

for candidate in front:
    # We should call here a function to try to correct the text
    print (obtain_text(candidate.variables))