-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathClassificationModelTest.java
135 lines (109 loc) · 4.76 KB
/
ClassificationModelTest.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
package activeLearningWithRationales;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import cc.mallet.types.Alphabet;
import cc.mallet.types.FeatureVector;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
import static org.junit.Assert.assertTrue;
/**
 * JUnit tests for {@code ClassificationModel}: loading the sample data set, TF-IDF weighting,
 * sentiment word-list lookup and r-factor re-weighting.
 *
 * <p>Every test loads its own {@link InstanceList} from {@code dataFile}, so the tests do not
 * share mutable state.
 */
public class ClassificationModelTest {

    /** Sample data set handed to {@code ClassificationModel.readDirectory}. */
    final String dataFile = "resources/mallet-sample.msv";

    private static final String NEGATIVE_WORDS_TXT = "negative_words.txt";
    private static final String POSITIVE_WORDS_TXT = "positive_words.txt";

    /** Absolute tolerance used for all floating-point comparisons. */
    final double delta = 1e-7;

    /** Reads the sample data set with a fresh {@code ClassificationModel}. */
    private InstanceList loadInstances() {
        ClassificationModel cm = new ClassificationModel();
        return cm.readDirectory(dataFile);
    }

    /** Returns the data of {@code instance} as a {@link FeatureVector}. */
    private static FeatureVector featureVectorOf(Instance instance) {
        return (FeatureVector) instance.getData();
    }

    /**
     * Looks up {@code term} without growing the alphabet and fails the test with a clear message
     * if the term is absent. The one-argument {@code lookupIndex} would silently add a missing
     * term and hand back a fresh index instead.
     */
    private static int requireIndex(Alphabet alphabet, String term) {
        int index = alphabet.lookupIndex(term, false);
        assertTrue("term not present in alphabet: " + term, index >= 0);
        return index;
    }

    /** The sample data set must load and contain exactly 50 instances. */
    @Test
    public void testReadDirectory() {
        InstanceList instances = loadInstances();
        assertNotNull(instances);
        assertEquals(50, instances.size());
    }

    /**
     * Checks the TF-IDF weights of two single-occurrence terms and two very frequent terms.
     *
     * <p>Weights are read with {@code FeatureVector.value(featureIndex)}, which resolves an
     * alphabet index for sparse and dense vectors alike; {@code getValues()} is indexed by
     * storage location and only coincides with the alphabet index for dense vectors.
     */
    @Test
    public void testSetTfIdf() {
        InstanceList instances = loadInstances();
        Alphabet alphabet = instances.getAlphabet();
        ClassificationModel.setTfIdf(instances);

        // Eddie and Murphy are single occurrence terms. Such non-repeating
        // words should have 1*log(50) (base e) as value.
        int eddieIndex = requireIndex(alphabet, "eddie");
        int murphyIndex = requireIndex(alphabet, "murphy");
        // film/movie should have 0 as value since each occurs 70 times
        // (more than the number of instances).
        int filmIndex = requireIndex(alphabet, "film");
        int movieIndex = requireIndex(alphabet, "movie");

        for (Instance instance : instances) {
            FeatureVector vector = featureVectorOf(instance);
            if (vector.contains("film")) {
                assertEquals(0.0, vector.value(filmIndex), delta);
            }
            if (vector.contains("movie")) {
                assertEquals(0.0, vector.value(movieIndex), delta);
            }
        }

        double singleOccurrenceWeight = Math.log(instances.size());
        FeatureVector firstVector = featureVectorOf(instances.get(0));
        assertEquals(singleOccurrenceWeight, firstVector.value(eddieIndex), delta);
        assertEquals(singleOccurrenceWeight, firstVector.value(murphyIndex), delta);
    }

    /**
     * The positive and negative word maps must contain a few well-known sentiment words and have
     * the expected sizes (2006 positive, 4783 negative).
     */
    @Test
    public void testGetSentimentWordList() {
        InstanceList instances = loadInstances();
        Alphabet alphabet = instances.getAlphabet();
        Map<Integer, String> positiveWordsMap = ClassificationModel.getSentimentWordList(alphabet, POSITIVE_WORDS_TXT);
        Map<Integer, String> negativeWordsMap = ClassificationModel.getSentimentWordList(alphabet, NEGATIVE_WORDS_TXT);

        assertTrue(positiveWordsMap.containsValue("great"));
        assertTrue(positiveWordsMap.containsValue("amazing"));
        assertTrue(positiveWordsMap.containsValue("fantastic"));
        assertEquals(2006, positiveWordsMap.size());

        assertTrue(negativeWordsMap.containsValue("bad"));
        assertTrue(negativeWordsMap.containsValue("terrible"));
        assertTrue(negativeWordsMap.containsValue("sucks"));
        assertEquals(4783, negativeWordsMap.size());
    }

    /**
     * Snapshots every instance's stored values after TF-IDF, applies {@code setRFactor}, and then
     * compares the stored values position by position.
     *
     * <p>Per instance the test expects that all stored values but one either changed or were
     * exactly {@code 0.0} beforehand, and that every changed value was scaled by a factor of 0.1.
     */
    @Test
    public void testSetRFactor() {
        InstanceList instances = loadInstances();
        Alphabet alphabet = instances.getAlphabet();
        ClassificationModel.setTfIdf(instances);
        Map<Integer, String> positiveWordsMap = ClassificationModel.getSentimentWordList(alphabet, POSITIVE_WORDS_TXT);
        Map<Integer, String> negativeWordsMap = ClassificationModel.getSentimentWordList(alphabet, NEGATIVE_WORDS_TXT);

        // Copy the arrays: getValues() exposes the vector's own storage, which setRFactor may
        // modify in place.
        List<double[]> preModifiedValuesList = new ArrayList<>(instances.size());
        for (Instance instance : instances) {
            preModifiedValuesList.add(featureVectorOf(instance).getValues().clone());
        }

        ClassificationModel.setRFactor(instances, positiveWordsMap, negativeWordsMap);

        for (int i = 0; i < instances.size(); i++) {
            double[] oldValues = preModifiedValuesList.get(i);
            double[] newValues = featureVectorOf(instances.get(i)).getValues();
            List<Double> factorOfChange = new ArrayList<>();
            int numberOfChanges = 0;
            for (int j = 0; j < newValues.length; j++) {
                if (Double.compare(oldValues[j], newValues[j]) != 0) {
                    factorOfChange.add(newValues[j] / oldValues[j]);
                    numberOfChanges++;
                } else if (Double.compare(oldValues[j], 0.0) == 0) {
                    // An unchanged zero still counts: scaling 0.0 cannot be observed.
                    numberOfChanges++;
                }
            }
            assertEquals(oldValues.length - 1, numberOfChanges);
            for (double eachFactor : factorOfChange) {
                assertEquals(0.1, eachFactor, delta);
            }
        }
    }
}