-
Notifications
You must be signed in to change notification settings - Fork 0
/
find_n-grams.py
57 lines (51 loc) · 1.43 KB
/
find_n-grams.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import re
import nltk
from nltk.corpus import stopwords
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
def preprocess(self):
    """Read a text file and return its normalised word tokens.

    Each line has trailing whitespace stripped, is case-folded, and has
    every character that is neither a word character nor whitespace
    removed, followed by every digit character.  What remains is split
    with ``nltk.wordpunct_tokenize``.  Tokens from all lines are collected
    in file order into one flat list.

    Args:
        self: Path of the text file to read.  Despite the name this is an
            ordinary function argument; it is passed straight to ``open``.

    Returns:
        A list of token strings covering the whole file.  The list is
        empty when the file contains no lines.
    """
    tokens = []
    # errors='ignore' drops bytes that are not valid UTF-8 instead of raising.
    with open(self, 'r', encoding="utf8", errors='ignore') as source:
        for reflections in source:
            case = reflections.rstrip().casefold()
            punct = re.sub(r'[^\w\s]', '', case)
            numb = ''.join(ch for ch in punct if not ch.isdigit())
            # Extend rather than rebind: rebinding would discard the tokens
            # of every earlier line and leave the name unbound for an
            # empty file.
            tokens.extend(nltk.wordpunct_tokenize(numb))
    return tokens
def bigram_process(path:str, top_n:int=225):
    """Print the most frequent bigrams of a text file, stopwords removed.

    The file is tokenised with ``preprocess``, English stopwords are
    filtered out, and adjacent pairs of the remaining tokens are counted.
    Each printed line is the bigram tuple followed by its count, most
    frequent first.  Bigrams with equal counts are printed in descending
    tuple order.

    Args:
        path: Path of the text file to analyse.
        top_n: Maximum number of bigrams to print.  Defaults to 225.
            ``None`` prints every bigram.

    Returns:
        None.  Results are written to standard output.
    """
    stop_w = set(stopwords.words('english'))
    # NOTE(review): preprocess() strips punctuation before tokenising, so a
    # stopword that contains an apostrophe cannot match its stripped form
    # (e.g. "dont") -- confirm whether that is intended.
    diction = [word for word in preprocess(path) if word not in stop_w]
    fdist = nltk.FreqDist(nltk.bigrams(diction))
    # Sort on (count, bigram) so ties fall back to the bigram itself.
    ranked = sorted(
        ((count, gram) for gram, count in fdist.items()),
        reverse=True,
    )
    for count, gram in ranked[:top_n]:
        print(gram, count)
def trigram_process(path:str, top_n:int=115):
    """Print the most frequent trigrams of a text file, stopwords removed.

    The file is tokenised with ``preprocess``, English stopwords are
    filtered out, and runs of three adjacent remaining tokens are counted.
    Each printed line is the trigram tuple followed by its count, most
    frequent first.  Trigrams with equal counts are printed in descending
    tuple order.

    Args:
        path: Path of the text file to analyse.
        top_n: Maximum number of trigrams to print.  Defaults to 115.
            ``None`` prints every trigram.

    Returns:
        None.  Results are written to standard output.
    """
    stop_w = set(stopwords.words('english'))
    # NOTE(review): preprocess() strips punctuation before tokenising, so a
    # stopword that contains an apostrophe cannot match its stripped form
    # (e.g. "dont") -- confirm whether that is intended.
    diction = [word for word in preprocess(path) if word not in stop_w]
    fdist = nltk.FreqDist(nltk.trigrams(diction))
    # Sort on (count, trigram) so ties fall back to the trigram itself.
    ranked = sorted(
        ((count, gram) for gram, count in fdist.items()),
        reverse=True,
    )
    for count, gram in ranked[:top_n]:
        print(gram, count)
def main():
    """Report token counts, then bigram and trigram rankings, per corpus.

    The token count of every corpus is printed first.  After that, each
    corpus in turn gets its bigram ranking followed by its trigram ranking.
    """
    corpora = ('corpus1.txt', 'corpus2.txt')
    for corpus in corpora:
        print(len(preprocess(corpus)))
    for corpus in corpora:
        bigram_process(corpus)
        trigram_process(corpus)


# Guarded so the corpus files are only read when this file is run as a
# script, not as a side effect of loading it as a module.
if __name__ == "__main__":
    main()