-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtraining.py
70 lines (43 loc) · 1.55 KB
/
training.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import functools
import re
from pprint import pprint

import nltk
import pymongo
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.models.wrappers import LdaMallet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# MongoDB handles used by this script (local server, database "travelbig1").
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
mydb = myclient["travelbig1"]

# Source collection: each record's 'text' field is read as a training document.
mycol = mydb["big_col1"]
# mycol3 stores the topic numbers and the words in each topic.
mycol3 = mydb["topicwords"]
# mycol4 stores each word and the document number to which it belongs.
mycol4 = mydb["doc_words"]

# NOTE: mycol2, in the travelbig2 database, contains the numbered documents;
# that database is the one to use for displaying the results.
@functools.lru_cache(maxsize=None)
def _english_stop_words():
    """Return NLTK's English stop-word list as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def process_text(query):
    """Tokenize *query* into lowercase alphabetic words, dropping stop words.

    Every character outside A-Z/a-z is replaced by a space, the result is
    lowercased and split with NLTK's ``word_tokenize``, and tokens that
    appear in NLTK's English stop-word list are removed.

    Args:
        query: The raw text to clean.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    # The stop-word list is cached so it is not re-read and re-wrapped on
    # every call; this function runs once per document during training.
    stop_words = _english_stop_words()
    letters_only = re.sub(r"[^A-Za-z]", " ", query)
    word_tokens = word_tokenize(letters_only.lower())
    return [w for w in word_tokens if w not in stop_words]
# Tokenized documents, in the order the source collection yields them.
train_text = []
# Maps each word to the number of a document that contains it.
docmap = {}

for doc_number, record in enumerate(mycol.find()):
    tokens = process_text(record['text'].lower())
    # Each assignment overwrites the previous one, so a word that occurs in
    # several documents ends up mapped to the last document containing it.
    # NOTE(review): confirm last-document-wins is what readers of doc_words expect.
    docmap.update(dict.fromkeys(tokens, doc_number))
    train_text.append(tokens)

print("no of entries in train text is %d" % len(train_text))
# Build the vocabulary and the bag-of-words corpus from the tokenized documents.
dictionary = Dictionary(train_text)
corpus = list(map(dictionary.doc2bow, train_text))

# Fit an HDP topic model on the corpus.
hdpmodel1 = HdpModel(corpus=corpus, id2word=dictionary)

# Ask for up to 30 topics with up to 200 words each. Each entry unpacks into a
# topic identifier and a string; everything except letters and spaces is
# removed from that string before splitting it into a word list
# (presumably this strips the weights and separators gensim adds - verify).
topic_listing = hdpmodel1.show_topics(num_topics=30, num_words=200)
twords = {
    str(topic_id): re.sub('[^A-Za-z ]+', '', topic_terms).split()
    for topic_id, topic_terms in topic_listing
}

# Persist both mappings, each as a single MongoDB document.
# NOTE(review): docmap holds one key per distinct word; check that it stays
# within MongoDB's per-document size limit for large collections.
mycol3.insert_one(twords)
mycol4.insert_one(docmap)