-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocessing
90 lines (87 loc) · 5.16 KB
/
preprocessing
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
.gmb.preprocessing.oneHotEncoder:{[table]
  // Creates k dummy (one-hot) variables for every categorical column.
  //
  // table -- table holding only categorical columns. Every cell is passed through
  //          string and then cast to symbol, so numeric category codes are accepted.
  //
  // Returns a dictionary of four tables, each with one row per input row and one
  // column per distinct (column name, value) pair. Dummy columns are named
  // <column name><value> and appear in order of first occurrence, column by column:
  //   CDT -- complete disjunctive table (1 where the row has that value, else 0)
  //   RFT -- relative frequency table: CDT divided by (row count * column count)
  //   CPT -- column profile table: every CDT column divided by its column sum
  //   RPT -- row profile table: CDT divided by the number of input columns
  //
  // Signals an error when two different columns produce the same dummy name
  // (e.g. column `a value `bc and column `ab value `c); such input cannot be
  // represented as a table with unique column names.
  table:`$string table;
  n:count table;
  colNames:cols table;
  p:count colNames;
  // one symbol list per input column, every value prefixed with its column name
  prefixed:{`$string[x],/:string y}'[colNames;value flip table];
  // number of dummy columns contributed by each input column
  distinctEntries:count each distinct each prefixed;
  uniqueColumnList:distinct raze prefixed;
  if[(count uniqueColumnList)<>sum distinctEntries;'"oneHotEncoder: dummy column names collide across columns"];
  // repeat each prefixed column once per dummy it produces, so that the i-th
  // dummy name lines up with the source column it has to be compared against
  expanded:raze distinctEntries#'enlist each prefixed;
  // 1* turns the boolean comparison into 0/1 longs
  completeDisjunctiveTable:flip uniqueColumnList!1*uniqueColumnList=expanded;
  relativeFrequencyTable:completeDisjunctiveTable%n*p;
  columnProfileTable:completeDisjunctiveTable%\:value sum completeDisjunctiveTable;
  rowProfileTable:completeDisjunctiveTable%p;
  :`CDT`RFT`CPT`RPT!(completeDisjunctiveTable;relativeFrequencyTable;columnProfileTable;rowProfileTable);
 };
.gmb.preprocessing.oneHotEncoderKMinusOne:{[table]
  // Creates k-1 dummy (one-hot) variables for every categorical column.
  //
  // table -- table holding only categorical columns. Every cell is passed through
  //          string and then cast to symbol, so numeric category codes are accepted.
  //
  // Returns the complete disjunctive table produced by
  // .gmb.preprocessing.oneHotEncoder (its `CDT entry) with, for every input
  // column, the dummy of the value that first appears last removed. A column
  // with a single distinct value therefore contributes no dummy at all.
  completeDisjunctiveTable:.gmb.preprocessing.oneHotEncoder[table]`CDT;
  // number of dummy columns contributed by each input column
  distinctEntries:count each distinct each value flip `$string table;
  // dummies are grouped by input column, so the running total minus one is the
  // position of the last dummy of every group
  lastDummyIdx:-1+sums distinctEntries;
  colsSuperfluous:(cols completeDisjunctiveTable) lastDummyIdx;
  // functional delete of the superfluous columns
  :![completeDisjunctiveTable;();0b;colsSuperfluous];
 };
.gmb.preprocessing.binnedTable:{[n;table]
  // Replaces every value by the index of the bin it falls into.
  // n     -- number of desired bins
  // table -- table with continuous data
  // Each column is ranked independently with xrank; column names are kept.
  binned:xrank[n] each flip table;
  :flip binned
 };
.gmb.preprocessing.binaryScale:{[table;binaryList]
  // Recodes every column of a table of binary features as 0 and 1.
  //
  // table      -- table with only binary columns
  // binaryList -- the two values of the binary features (any data type), in the
  //               desired order from 0 to 1: the first distinct value becomes 0
  //               and the second becomes 1, matching .gmb.preprocessing.ordinalScale
  //
  // Returns the table with all columns recoded. Cells holding a value that is
  // not in binaryList are looked up in the dictionary without a match
  // (presumably yielding a null -- confirm against callers).
  // Signals an error unless binaryList holds exactly two distinct values.
  uniqueList:distinct binaryList;
  if[2<>count uniqueList;'"binaryScale: binaryList must contain exactly two distinct values"];
  binaryDictionary:uniqueList!0 1;
  colNames:cols table;
  // amend all columns at once with their dictionary look-ups
  :@[table;colNames;:;binaryDictionary table colNames];
 };
.gmb.preprocessing.ordinalScale:{[table;ordinalList]
  // Recodes every column of a table of ordinal features as integer ranks.
  // table       -- table with only ordinal columns
  // ordinalList -- the ordinal levels, ordered from lowest rank to highest
  // Returns the table with each level replaced by its position (0 for the
  // lowest level) among the distinct entries of ordinalList.
  levels:distinct ordinalList;
  rankOf:levels!til count levels;
  colNames:cols table;
  // overwrite all columns in one amend with their looked-up ranks
  :@[table;colNames;:;rankOf table colNames];
 };
.gmb.preprocessing.singleValueColumns:{[table]
  // Finds a column that holds one single distinct value.
  // table -- any table
  // Returns the name of the FIRST column whose distinct count is 1; when no
  // column qualifies the dictionary look-up finds nothing (presumably the
  // null symbol -- confirm against callers).
  // NOTE(review): despite the plural name only the first match is returned.
  // Defined with the leading dot so that it lives in the .gmb.preprocessing
  // namespace like the other functions of this file.
  distinctCounts:count each distinct each flip table;
  :distinctCounts?1;
 };