-
Notifications
You must be signed in to change notification settings - Fork 0
/
gaussian_model.py
70 lines (52 loc) · 2.85 KB
/
gaussian_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import numpy as np
import pandas as pd
import math
import sys
import scipy.stats # multivariate_normal
class GaussianModel:
    """Diagonal-covariance Gaussian classifier over per-sample feature vectors.

    Each state (phone) emits feature vectors according to a multivariate
    normal whose per-dimension mean and standard deviation come from
    ``parameters``.  Emission scores are *unnormalized* log posteriors:
    log p(x|state) + log p(state), with the evidence p(x) omitted because
    it is constant across states.
    """

    def __init__(self, phone2features, index2features, parameters):
        # parameters: {'mean': {state: per-dim means}, 'sd': {state: per-dim sds}}
        self.parameters = parameters
        # phone2features: state -> collection of samples initially assigned to it
        self.phone2features = phone2features
        # Materialize the states as a list so the ordering is stable, it is
        # unaffected by later mutation of phone2features, and it can serve
        # directly as a pandas index.  e.g. ['s', 'e', 'r', 'o', ...]
        self.states = list(phone2features.keys())
        # index2features: sample index -> feature vector
        self.index2features = index2features
        # Number of feature dimensions (e.g. 88), taken from sample 0.
        self.dimensions = len(self.index2features[0])
        self.priors = self.calculate_priors()

    def calculate_priors(self):
        """Return {state: prior}, the fraction of samples assigned to each state."""
        total_data = len(self.index2features)
        priors = {
            state: len(self.phone2features[state]) / total_data
            for state in self.states
        }
        total = sum(priors.values())
        # Sanity check: priors over all states must sum to ~1.  Uses a
        # symmetric tolerance (the previous asymmetric 0.99 / 1.001 window
        # was an inconsistency, not a deliberate bound).
        if not math.isclose(total, 1.0, abs_tol=1e-2):
            raise ValueError(f'priors for all classes (states) should sum up to 1.0\npriors sum up to: {total}')
        return priors

    def get_emission_probabilities(self):
        """Return a DataFrame of log-posterior scores, states x sample indices.

        Entry [state, index] is calculate_posterior(state, features[index]),
        i.e. a log probability (not exponentiated).
        """
        # Initialize with float 0.0: an int-dtype frame would trigger a
        # pandas FutureWarning (and, in pandas 3, an error) when float
        # values are assigned into it via .loc.
        emission_prob = pd.DataFrame(
            0.0, index=self.states, columns=list(self.index2features)
        )
        for index, features in self.index2features.items():
            for state in self.states:
                emission_prob.loc[state, index] = self.calculate_posterior(state, features)
        return emission_prob

    def calculate_posterior(self, state, x):
        """Unnormalized log posterior of ``state`` given feature vector ``x``.

        p(class|x) = p(x|class) * p(class) / p(x); since p(x) is the same
        for every class it is dropped, so these values are comparable
        across states but do not sum to one.
        """
        log_pdf = self.calculate_logpdf(state, x)  # log p(x|class)
        return log_pdf + np.log(self.priors[state])

    def calculate_logpdf(self, state, x):
        """Log density of ``x`` under this state's diagonal-covariance Gaussian."""
        # Diagonal covariance built directly from the per-dimension sds;
        # indexing (rather than slicing) keeps this working whether the sd
        # container is a sequence or an int-keyed mapping.
        variances = [self.parameters['sd'][state][i] ** 2 for i in range(self.dimensions)]
        covariance = np.diag(variances)
        mean = self.parameters['mean'][state]
        # allow_singular=True tolerates zero-variance dimensions.
        return scipy.stats.multivariate_normal.logpdf(
            x, mean=mean, cov=covariance, allow_singular=True
        )