-
Notifications
You must be signed in to change notification settings - Fork 1
/
3_1.py
53 lines (39 loc) · 1.29 KB
/
3_1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# https://www.youtube.com/watch?v=Iu-HLy9hICg, 27:32
# Similarity (statistical properties) - data preparation
def GetSentences(text):
    """Split *text* into sentences on '.', '?' and '!'.

    Each piece is stripped of surrounding whitespace. Pieces that are empty
    after stripping (for example the remainder after a final full stop, or
    the gaps inside "...") are dropped so they are not counted as sentences.

    Returns a list of non-empty sentence strings in order of appearance.
    """
    # Raw string; inside a character class '.', '?' and '!' are literal and
    # need no backslash (the non-raw '\.' form is an invalid string escape).
    pieces = (piece.strip() for piece in re.split(r'[.?!]', text))
    return [piece for piece in pieces if piece]
def GetWords(text):
    """Return the words found in *text*, in order of appearance.

    A word is a maximal run of regex word characters (letters, digits and
    underscore). Matching the words directly, instead of splitting on the
    separators, avoids the empty strings that splitting produces when the
    text starts or ends with a non-word character.
    """
    return re.findall(r'\w+', text)
def GetPunctuations(text):
    """Return every punctuation character of *text*, concatenated in order.

    A character counts as punctuation when it occurs in string.punctuation.
    A plain membership test is used instead of pasting string.punctuation
    into a regex character class: unescaped, its backslash is taken as an
    escape for the following ']', so a literal backslash was never matched.
    """
    return "".join(ch for ch in text if ch in string.punctuation)
def GetByteNGrams(text, n):
    """Return all contiguous length-*n* slices of *text*, left to right.

    The window advances one position at a time, so consecutive n-grams
    overlap. When *text* is shorter than *n* the result is an empty list.
    """
    windowCount = len(text) - n + 1
    grams = []
    for start in range(windowCount):
        grams.append(text[start:start + n])
    return grams
def GetWordNGram(words, n):
    """Return all runs of *n* consecutive items of *words*, space-joined.

    The window advances one word at a time. When *words* has fewer than
    *n* items the result is an empty list.
    """
    # The presentation had a bug here - it has been fixed for you :)
    grams = []
    for start in range(len(words) - n + 1):
        window = words[start:start + n]
        grams.append(" ".join(window))
    return grams
import os
import re
import string
import pprint

# Input files are read from the "Process" directory under the current
# working directory. (Named processDir so the builtin dir() is not shadowed.)
processDir = os.path.join(os.getcwd(), "Process")

# Maps each file name to a dict of the features extracted from that file.
data = {}

# n-gram sizes used for the word-level and character-level features.
wordNGram = 2
byteNGram = 4

for fileName in os.listdir(processDir):
    filePath = os.path.join(processDir, fileName)
    # os.listdir also returns sub-directories; open() would fail on them.
    if not os.path.isfile(filePath):
        continue

    with open(filePath, 'r', encoding='UTF-8') as srcFile:
        # Lower-case the whole text so every feature is case-insensitive.
        text = srcFile.read().lower()

    item = {}
    item["text"] = text
    # split into sentences
    item["sentences"] = GetSentences(text)
    # split into words
    item["words"] = GetWords(text)
    # all punctuation characters used
    item["punctuations"] = GetPunctuations(text)
    # character-level ("byte") n-grams
    item["byteNGrams"] = GetByteNGrams(text, byteNGram)
    # word n-grams
    # NOTE(review): this passes wordNGram + 1 (i.e. 3 with the value above),
    # while byteNGram is passed unchanged - confirm the "+ 1" is intended.
    item["wordNGrams"] = GetWordNGram(item["words"], wordNGram + 1)
    data[fileName] = item

# Dump the collected features for every processed file.
pp = pprint.PrettyPrinter(indent=4, width=100)
pp.pprint(data)