-
Notifications
You must be signed in to change notification settings - Fork 0
/
Test.py
98 lines (92 loc) · 2.42 KB
/
Test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
from __future__ import division
import string
import pickle
from bs4 import BeautifulSoup
import csv
import re
import os
import sys
import random
# Folder whose entries are listed and read by the file-processing loop
# further down in this script.
path = '/Users/darius/Documents/ComSci2/project4/lyricsmode'

# Number of files for which the BeautifulSoup step raised an exception;
# incremented by the file-processing loop.
badcount = 0

# Queries the interpreter's default string encoding. The result is discarded,
# so this statement has no effect on the rest of the script.
sys.getdefaultencoding()
# Co-occurrence tables filled by count(). Both have the shape
#     word -> {earlier word -> number of times that pairing was seen (float)}
# dict1 looks one position back, dict2 looks two positions back.
dict1 = {}
# Module-level token list. count() builds its own local token list and never
# reads or writes this one.
words = []
dict2 = {}
# Running number of tokens that have passed through count(), kept as a float.
wordCount = 0.0


def count(line):
    """Add the tokens of *line* to the global co-occurrence tables.

    ``line`` is split on single spaces. Every resulting token increases the
    global ``wordCount`` by one. For each token from the third one onwards,
    the token one position before it is tallied in ``dict1[token]`` and the
    token two positions before it is tallied in ``dict2[token]``.

    The tables hold raw counts (as floats). Converting a count into a
    probability has to happen when the tables are read: ``wordCount`` is
    still growing while input is being processed, so a value divided by the
    total now would be on a different scale from one divided later, and
    adding 1.0 to an already divided value would mix the two.

    Args:
        line: One line of text whose tokens are separated by single spaces.

    Returns:
        None. The function works through side effects only: it mutates
        ``dict1`` and ``dict2``, updates ``wordCount`` and rewrites the files
        ``save.p`` and ``save2.p`` in the current working directory.
    """
    global wordCount  # rebound below; dict1/dict2 are only mutated in place

    tokens = line.split(' ')
    wordCount += float(len(tokens))

    # Start at index 2 so that both look-backs (one and two positions) exist.
    # NOTE(review): this also means the pairing of tokens[1] with tokens[0]
    # is never recorded in dict1 -- confirm that this is intended.
    for pos in range(2, len(tokens)):
        current = tokens[pos]
        one_back = tokens[pos - 1]
        two_back = tokens[pos - 2]

        # setdefault creates the row on first sight *and* lets the very first
        # observation be counted instead of being dropped.
        row_one = dict1.setdefault(current, {})
        row_one[one_back] = row_one.get(one_back, 0.0) + 1.0

        row_two = dict2.setdefault(current, {})
        row_two[two_back] = row_two.get(two_back, 0.0) + 1.0

    # Persist both tables; the context managers make sure the handles are
    # closed (and the data flushed) before the function returns.
    # NOTE(review): both files are rewritten in full on every call. If one
    # dump after the last line is enough, move these writes to the caller.
    with open("save.p", "wb") as table_file:
        pickle.dump(dict1, table_file)
    with open("save2.p", "wb") as table_file:
        pickle.dump(dict2, table_file)
# Cleaning patterns, compiled once instead of on every file and every line.
# Raw strings keep the backslashes for the regex engine without relying on
# Python passing unknown string escapes through unchanged.
_BRACKETED_RE = re.compile(r"[\(\[].*?[\)\]]")   # "(...)" / "[...]" within one line
_NOT_ALLOWED_RE = re.compile(r"[^a-z0-9' \n]+")  # anything but a-z, 0-9, ', space, newline
_LEADING_WORD_RE = re.compile(r"\w+")

# Number of leading lines dropped from every file before counting.
# NOTE(review): presumably header lines of the input files -- confirm.
_LEADING_LINES_TO_SKIP = 6

# Read every entry of `path`, extract its text, normalise it and feed it to
# count() one line at a time.
for filename in os.listdir(path):
    myfile = os.path.join(path, filename)
    pretext = ''
    t = ''

    # Undecodable bytes are dropped ('ignore'), so decoding cannot fail; the
    # only possible failure is opening/reading the entry (missing file,
    # permissions, a sub-directory, ...). Opening the same path again with a
    # different encoding would fail the same way, so the entry is skipped.
    try:
        with open(myfile, 'rb') as f:
            t2 = f.read().decode('utf8', 'ignore')
    except OSError as err:
        print("unreadable file, skipping:", err)
        continue

    # Look for <pre> elements in the page.
    try:
        soup = BeautifulSoup(t2, "html.parser")
        pretext = soup.find_all('pre')
    except Exception as err:
        # Parser failures are tallied and reported; the raw-text fallback
        # below still gets a chance because `pretext` stays empty.
        badcount += 1
        print("Unexpected error from soup:", err)

    if pretext:
        # Only the text of the last <pre> element is used.
        t = pretext[-1].get_text()
    else:
        # No <pre> element found: fall back to the whole file as latin-1 text.
        try:
            with open(myfile, encoding='latin1') as rawfile:
                t = rawfile.read()
        except OSError:
            print("badfile2 :"+ myfile)

    # Normalise: lower-case, drop bracketed spans, then drop every character
    # outside the allowed set.
    t = t.lower()
    t = _BRACKETED_RE.sub("", t)
    t = _NOT_ALLOWED_RE.sub("", t)

    lines = t.split('\n')[_LEADING_LINES_TO_SKIP:]
    for line in lines:
        # Collapse runs of whitespace so count() can split on single spaces.
        line = ' '.join(line.split())
        # Lines that do not start with a letter, digit or underscore (empty
        # lines, lines starting with an apostrophe) are not counted.
        if _LEADING_WORD_RE.match(line):
            # '$' marks the start of a line and '#' its end.
            newline = '$ ' + line + ' #'
            count(newline)
#print(dict1['man'])
# GENERATE OUTPUT