-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcaesar.js
190 lines (187 loc) · 6.69 KB
/
caesar.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
//import sbd module
const tokenizer = require('sbd');
class Caesar {
constructor() {
this.caesarUrl = "https://www.thelatinlibrary.com/caesar/gallic/"
}
/**
* Removes newlines and stuff between square brackets from a string
* @param {string} text
*/
cleanText(text) {
return text.replace(/\r?\n|\r/g, "").replace(/ *\[[^\]]*]/g, '');
}
/**
* Removes punctuation in a string.
* @param {string} text
*/
removePunctuation(text) {
return text.replace(/[.,\/#!$%\^&\*;:"{}=\-_`~()]/g,"");
}
/**
* Gets de bello gallico text.
* from https://www.thelatinlibrary.com/caesar/
*/
async getCaesarText() {
let caesarText = [];
//fetch each html file by interating over them in a loop.
for (let i = 1; i <= 8; i++) {
// make indexes start at 0.
caesarText[i - 1] = await this.getCaesarBook(i);
}
return caesarText;
}
/**
* Gets a certain html file and returns the text.
* based on the parameter.
* from https://www.thelatinlibrary.com/caesar/
* @param {string} i
*/
async getCaesarBook(i) {
//the fetch funtion sends a GET http request to fetch the text.
//I've set it to return an object, which contains the full text, an object containing the text and chapter number for each chapter, and the book number.
let caesarBook = await fetch(`${this.caesarUrl}gall${i}.shtml`).then(async (response) => {
let text = await response.text();
//initiate book object
let book = {
text: "",
chapters: {},
bookNumber: i
};
//load text in cheerio.
let $ = cheerio.load(text);
//remove empty p elements.
$( 'p:empty' ).remove();
//separate each chapter of the text.
//for some reason the first book doesn't have .page_wrapper, so I have to use ternary expressions to change the selector for books 2-8.
for (let j = (i !== 8 ? 3 : 4); j <= ( i == 1 ? $(".page_wrapper p").length : $("body p").length); j++) {
//delete the square brackets and any numbers inside them at the beginning of each chapter and remove linebreaks, and trim spaces.
let chapterText = this.cleanText($(`${i == 1 ? ".page_wrapper" : "body"} p:nth-child(${j})`).text());
//fill objects with data.
book.text = book.text.concat(chapterText);
//convert all text to lowercase letters.
book.text = book.text.toLowerCase();
book.chapters[j - 3] = {
chapterText: chapterText,
chapterNumber: j - 3
}
}
return book;
})
return caesarBook;
}
/**
* Gets the entire text object.
* @param {string} caesarText
*/
getFullText(caesarText) {
let fullText = "";
//put the entire text into one string.
for (let book of caesarText) {
fullText = fullText.concat(" " + book.text);
}
return fullText;
}
/**
* Splits the entire text into sentences.
* @param {string} caesarText
*/
getSentences(caesarText) {
//get full text
let fullText = this.getFullText(caesarText);
//using an external module for parsing sentences, as some sentences are tricky with names, like "C. Caesare"
let sentences = tokenizer.sentences(fullText, {
//tell the tokenizer to note these various latin abbreviations.
"abbreviations": ["Cn","cn", "C", "kal", "apr", "id", "ii", "iii", "iv", "v", "vi", "vii", "viii", "viiii", "x", "xi", "xii", "xiii", "xiiii", "XV", "ser", "ap"]
});
return sentences;
}
/**
* Splits the entire text into sentences, and outputs word and character count.
* @param {string} caesarText
*/
getSentencesWithWordAndCharacterCount(caesarText) {
//get sentences
let sentences = this.getSentences(caesarText);
//add data
return sentences.map((sentence) => ({"sentence": sentence, "wordCount": this.getWords(sentence).length, "charCount": sentence.length}));
}
/**
* Gets the average sentence length by the number of words.
* @param {string} caesarText
*/
getAverageSentenceLengthByWords(caesarText) {
//get the sentences of the entire text.
let sentences = this.getSentences(caesarText);
//get the total word count
let totalWordCount = 0;
for (let sentence of sentences) {
//split the word by each space, and count up how many words.
let wordCount = sentence.split(" ").length;
//equivilant to totalWordCount = totalWordCount + wordCount;
totalWordCount += wordCount;
}
//return the totalWordCount divided by the total number of sentences.
return totalWordCount / sentences.length;
}
/**
* Gets the average sentence length by the number of characters, not including spaces.
* @param {string} caesarText
*/
getAverageSentenceLengthByCharacters(caesarText) {
//get the sentences of the entire text.
let sentences = this.getSentences(caesarText);
//get the total character count
let totalCharCount = 0;
for (let sentence of sentences) {
//split the word by each character.
let words = sentence.split("");
//filter out spaces.
words = words.filter((char) => (char !== " " && char !== ""));
//equivilant to totalCharCount = totalCharCount + words.length;
totalCharCount += words.length;
}
//return the totalCharCount divided by the total number of characters.
return totalCharCount / sentences.length;
}
/**
* Gets an array of every word in the text.
* @param {string} caesarText
*/
getAllWords(caesarText) {
//get full text
let fullText = this.getFullText(caesarText);
//remove punctuation
fullText = this.removePunctuation(fullText);
//get words and filter out spaces and quotations
let words = fullText.split(" ").filter(char => (char !== " " && char !== ""));
return words;
}
/**
* Gets an array of every word in a string.
* @param {string} str
*/
getWords(str) {
//remove punctuation
str = this.removePunctuation(str);
//get words and filter out spaces and quotations
let words = str.split(" ").filter(char => (char !== " " && char !== ""));
return words;
}
/**
* Gets most common words.
* @param {array} words
*/
getMostCommonWords(words, stockWords = []) {
//filter out stock words
words = words.filter((word) => !stockWords.includes(word));
//https://stackoverflow.com/a/35101824/10972030 -> get fequencies of each word
let wordFrequencies = new Map([...new Set(words)].map(
x => [x, words.filter(y => y === x).length]
));
//get the entries iterator, and use spread syntax to convert to array. Sort by frequency.
let wordFrequenciesSorted = [...wordFrequencies.entries()].sort((ab, bc) => bc[1] - ab[1] || bc[0] - ab[0]);
return wordFrequenciesSorted;
}
}
module.exports = Caesar;