py_data_loader/process_sample.py at master · al1357/py_data_loader · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# -*- coding: utf-8 -*-

'''
Tasks:
    1) For ordinal encoding change order of classes, e.g. A=>1 B=>2 and A=>2 B=>1, train a network and see if something changes

'''


"""
Created on Mon Dec 30 09:50:01 2019

@author: al2357
"""
import pandas as pd
from tensorflow.keras.utils import to_categorical
from process_helper import *
import numpy as np

#pd.set_option('display.max_rows', 60)
#pd.set_option('display.max_columns', 4)
#pd.set_option('display.width', 80)

##############################
## 1) Read data
# Load the source CSV into a DataFrame; the relative path is resolved against
# the current working directory.
data = pd.read_csv(filepath_or_buffer="data/121229-tauris_estate.csv")

## gets sample data headers
#print(data.head())
#print(data.tail())

## counts missing (null/NaN) values in each column
#print(data.isnull().sum())

## gets number and kind of unique entries
#data['kategoria'].value_counts()

## stats about given column
#data['rok-produkcji'].describe()


##############################
## 2a) Drop unnecessary properties
# Remove, in place, the columns that are not used by the rest of the script.
data.drop(
    columns=[
        "kategoria",
        "typ",
        "marka-pojazdu",
        "model-pojazdu",
        "vin",
        "naped",
        "filtr-czastek-stalych",
        "mozliwosc-finansowania",
        "leasing",
        "url",
        "stan",
    ],
    inplace=True,
)


##############################
## 2b) Deal with missing data
# Rows whose gearbox is recorded as the placeholder string '0' are treated as
# missing and filled in from the fuel type: hybrid/electric cars get
# 'automatyczna-cvt', every other row gets 'manualna'.
gearbox_missing = data['skrzynia-biegow'] == '0'
if gearbox_missing.any():
    # The original check compared against 'eletryczny', which looks like a
    # misspelling of 'elektryczny' and would never match correctly spelled
    # data. Both spellings are accepted so previously matched rows still match.
    electrified = data['rodzaj-paliwa'].isin(['hybryda', 'elektryczny', 'eletryczny'])
    # Boolean-mask assignment replaces the row-by-row iterrows() loop.
    data.loc[gearbox_missing & electrified, 'skrzynia-biegow'] = 'automatyczna-cvt'
    data.loc[gearbox_missing & ~electrified, 'skrzynia-biegow'] = 'manualna'
#end if

# Replace the placeholder string '0' in the country-of-origin column with the
# category 'inny'. A single masked assignment does what the per-row
# iterrows() loop did; the guard keeps the no-match case a strict no-op.
origin_missing = data['kraj-pochodzenia'] == '0'
if origin_missing.any():
    data.loc[origin_missing, 'kraj-pochodzenia'] = 'inny'
#end if


##############################
# 3a) Remove text from numbers, convert to ints. e.g. 2300km to 2300
# Column name -> marker passed to cols_to_numeric (defined in process_helper).
# Three columns share the "non_num" marker; engine capacity uses "cm3".
replace = dict.fromkeys(("przebieg", "moc", "price"), "non_num")
replace["pojemnosc-skokowa"] = "cm3"
data = cols_to_numeric(data, replace)


##############################
## 4a) Convert categories to ints; option I - replace
#columns_to_categorize = ["oferta-od", "wersja", "rodzaj-paliwa", "skrzynia-biegow", "kolor", "kraj-pochodzenia", "zarejestrowany-w-polsce", "bezwypadkowy", "serwisowany-w-aso"]
#(data, replace_map) = cat_to_int_replace(data, columns_to_categorize)


##############################
## 4b) Convert categories to ints; option II - use scikit LabelEncoder(converting target var - label); used on m x 1
#columns_to_categorize = ["oferta-od", "wersja", "rodzaj-paliwa", "skrzynia-biegow", "kolor", "kraj-pochodzenia", "zarejestrowany-w-polsce", "bezwypadkowy", "serwisowany-w-aso"]
#(data, le_classes) = cat_to_int_label_enc(data, columns_to_categorize)


##############################
## 4c) Categories to int - use OrdinalEncoder(converting features); used on m x n
#from sklearn.preprocessing import OrdinalEncoder
#columns_to_categorize = ["oferta-od", "wersja", "rodzaj-paliwa", "skrzynia-biegow", "typ", "kolor", "kraj-pochodzenia", "zarejestrowany-w-polsce", "bezwypadkowy", "serwisowany-w-aso"]
#data = cat_to_int_ord_enc(data, columns_to_categorize)


##############################
# 4d) Convert categorical data to one-hot encoding
# .fit(); .transform(); .inverse_transform(); .categories_
# Columns handed to cat_to_oh_enc (defined in process_helper) for encoding.
columns_to_encode = [
    "oferta-od",
    "wersja",
    "rodzaj-paliwa",
    "skrzynia-biegow",
    "kolor",
    "kraj-pochodzenia",
    "zarejestrowany-w-polsce",
    "bezwypadkowy",
    "serwisowany-w-aso",
]
# Per-column category passed as the third argument; presumably the category
# whose one-hot column is dropped -- confirm against cat_to_oh_enc.
cats_to_drop = {
    "kolor": "innykolor",
    "kraj-pochodzenia": "inny",
    "serwisowany-w-aso": "0",
}
data, x, y = cat_to_oh_enc(data, columns_to_encode, cats_to_drop)


##############################
# 5a) Split into data and labels
# Copy the target column out as a one-column DataFrame, then remove it from
# the feature frame in place.
labels = data.loc[:, ["price"]]
data.drop(columns=["price"], inplace=True)


##############################
# 5b) Split into train and test sets
import math

# From here on work with the underlying NumPy arrays instead of DataFrames.
labels = labels.values
data = data.values

# The first 20% of the rows (rounded down) become the test set and the
# remainder the training set.
# NOTE(review): no shuffling happens in this script before the split --
# confirm the row order of the input is not biased.
m = labels.shape[0]
m_test = math.floor(m * 0.2)
labels_test, labels_train = labels[:m_test], labels[m_test:]
x_test, x_train = data[:m_test], data[m_test:]


##############################
# *I) Save replace_map in a csv file
#with open("data/replace_map.csv", "w") as file:
#    w = csv.writer(file)
#    for col, categories in replace_map.items():
#        w.writerow(["*", col])
#        print(["*", col])
#        for cat_txt, cat_int in categories.items():
#            w.writerow([cat_txt, cat_int])
#            print([cat_txt, cat_int])
#    file.close()


##############################
# II*) Save data in a CSV file
#data.to_csv("data/200101-tauris_estate-p.csv", index=False, encoding='utf8')