-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_processing.py
57 lines (50 loc) · 1.98 KB
/
data_processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
"""Contains data processing function, e.g. converting from matlab file to
Python objects and training a logistic regression on the letter images.
"""
import os
import scipy.io
import numpy as np
from sklearn import linear_model
import matplotlib.pyplot as plt
def read_PA3Data(file='PA3Data.mat'):
    """Load the letter images stored under ``"allWords"`` in a MATLAB file.

    Args:
        file: Path of the ``.mat`` file. A relative path is resolved against
            the directory containing this module, so the result does not
            depend on the caller's working directory. An absolute path is
            used unchanged. Anything that is not a path (for example an open
            file object) is handed to ``scipy.io.loadmat`` as is.

    Returns:
        A list of words. Each word is a list of ``(image, character)``
        tuples, where ``image`` is the first field of the stored letter
        record and ``character`` is the lowercase letter obtained from the
        1-based alphabet index held in the second field (1 -> "a").

    Raises:
        KeyError: If the file has no ``"allWords"`` variable.
    """
    # Resolve the location without os.chdir(): changing the working directory
    # would be a process-wide side effect visible to every other caller.
    if isinstance(file, (str, os.PathLike)):
        module_dir = os.path.dirname(os.path.abspath(__file__))
        # os.path.join keeps `file` untouched when it is already absolute.
        file = os.path.join(module_dir, file)

    raw_data = scipy.io.loadmat(file)["allWords"]

    alphabet_offset = ord("a") - 1  # stored labels are 1-based
    words = []
    for word in raw_data:
        letters = []
        for let in word[0]:
            record = let[0]
            image = record[0]
            character = chr(record[1][0][0] + alphabet_offset)
            letters.append((image, character))
        words.append(letters)
    return words
def read_PA3Models_pairwise(file="PA3Models.mat"):
    """Load the pairwise letter model stored in a MATLAB file.

    Args:
        file: Path of the ``.mat`` file. A relative path is resolved against
            the directory containing this module, so the result does not
            depend on the caller's working directory. An absolute path is
            used unchanged. Anything that is not a path (for example an open
            file object) is handed to ``scipy.io.loadmat`` as is.

    Returns:
        The array stored under ``"pairwiseModel"``; according to the original
        description it holds the probabilities that two letters appear
        together.

    Raises:
        KeyError: If the file has no ``"pairwiseModel"`` variable.
    """
    # Resolve the location without os.chdir(): changing the working directory
    # would be a process-wide side effect visible to every other caller.
    if isinstance(file, (str, os.PathLike)):
        module_dir = os.path.dirname(os.path.abspath(__file__))
        # os.path.join keeps `file` untouched when it is already absolute.
        file = os.path.join(module_dir, file)

    return scipy.io.loadmat(file)["pairwiseModel"]
def train_logreg_model(words):
    """Train a logistic regression that maps letter images to characters.

    Args:
        words: Iterable of words, each word being an iterable of
            ``(image, character)`` pairs. Every image is flattened into a
            one-dimensional feature vector, so all images must have the same
            number of pixels.

    Returns:
        The fitted ``sklearn.linear_model.LogisticRegression`` model. Its
        ``predict_proba`` method returns the probability of each letter.

    Raises:
        ValueError: If ``words`` contains no letters at all.
    """
    letters = [letter for word in words for letter in word]
    if not letters:
        # Without this guard the failure would be an opaque tuple-unpacking
        # error; the exception type stays ValueError either way.
        raise ValueError("train_logreg_model needs at least one labelled letter")

    features = np.array([np.asarray(image).ravel() for image, _ in letters])
    labels = np.array([label for _, label in letters])

    logreg = linear_model.LogisticRegression()
    logreg.fit(features, labels)
    return logreg
def show_letter(letter):
    """Show the image of one letter in a matplotlib window.

    ``letter`` is indexed at position 0 to obtain the image, which is
    transposed before plotting with the "Greys" colour map.
    """
    image = letter[0]
    plt.imshow(image.T, cmap="Greys")
    plt.show()
def main():
    """Load the letter data and fit the classifier on all but the last ten words."""
    all_words = read_PA3Data()
    training_words = all_words[:-10]
    mod = train_logreg_model(training_words)


# Only run the training when the file is executed as a script.
if __name__ == "__main__":
    main()