-
Notifications
You must be signed in to change notification settings - Fork 3
/
BTM.py
114 lines (84 loc) · 2.16 KB
/
BTM.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# coding: utf-8
# In[309]:
import pandas as pd
import scipy as sp
import numpy as np
import csv
import sys
import os
import math
import random as rd
def GibbsSampler(widf, docs,wd,T,ITER, lidf=[],lwd=[]):
# In[404]:
D = len(docs)
V = max(wd)
W = len(wd)
ALPHA = 50/float(T)
BETA = 5/float(V)
WBETA = V*BETA
TALPHA = T*ALPHA
# In[406]:
wtp = np.zeros((T,V,V),dtype=np.float128)
totz = np.zeros((T,1),dtype=int)
zw = np.zeros((W,1),dtype=int)
zd = np.zeros((D,T),dtype=int)
# initialise
di = 0
for d in docs:
k = rd.randint(0,T-1)
zd[di,k] += 1
totz[k] += 1
zw[d[0]] = k
for i in range(d[0]+1,d[1]+1):
k = rd.randint(0,T-1)
zd[di,k] += 1
totz[k] +=1
zw[i] = k
wtp[k,wd[i-1]-1,wd[i]-1] += widf[wd[i-1]-1,wd[i]-1]
di+=1
# In[408]:
# Gibbs Sampling
for itr in range(ITER):
print 'iter:',itr
di = 0
for d in docs:
for i in range(d[0],d[1]+1):
u = zw[i]
totz[u]-=1
zd[di,u]-=1
tmp1 = 1
if i > d[0]:
wtp[u,wd[i-1]-1,wd[i]-1] -= widf[wd[i-1]-1,wd[i]-1]
tmp1 = (wtp[:,wd[i-1]-1,wd[i]-1] + BETA)/(np.sum(wtp[:,wd[i-1]-1,:],1)+WBETA)
totprob = 0
prob = (zd[di,:]+ALPHA)/(d[1]-d[0]+1+TALPHA) * tmp1
totprob = sum(prob)
#sample from topic distribution
r = rd.random() * totprob
maxprob = prob[0]
k=0
while(r > maxprob):
#print k,r,maxprob
k+=1
maxprob += prob[k]
zw[i] = k
zd[di,k]+=1
totz[k]+=1
if i > d[0]:
wtp[k,wd[i-1]-1,wd[i]-1] += widf[wd[i-1]-1,wd[i]-1]
di+=1
return(wtp, zw, np.argmax(zd,1), totz)
# In[ ]:
def predictDoc(wtp,wd,docs):
zd = np.zeros(len(docs))
di = 0
prob = np.zeros((len(docs),wtp.shape[0]))
for d in docs:
tmp1 = wtp[:,wd[d[0]:d[1]]-1,wd[d[0]+1:d[1]+1]-1]
if (d[1]-d[0] > 1):
prob[di,:] += np.sum(np.log(tmp1),1)
else :
prob[di,:] += np.log(tmp1)
zd[di]=np.argmax(prob[di,:])
di+=1
return (zd,prob)