-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathData_Prep.sas
371 lines (285 loc) · 8.44 KB
/
Data_Prep.sas
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
Libname Group5 "/home/u58677578/BAN110/Project";
/* Loading Data set from txt file */
Data Group5.Auto_Mpg;
Infile "/home/u58677578/BAN110/Project/auto-mpg.data";
Informat Name $30.;
INput @1 Mpg 4.
@8 Cylinders 1.
@12 Displacement 5.
@23 Horsepower 5.
@34 Weight 5.
@45 Acceleration 4.
@52 Year 2.
@56 Origin 1.
@58 Name & $30.;
Format Mpg 4.1 Displacement 5.1 Horsepower 5.1 Weight 6.1
Acceleration 4.1;
Run;
/* Checking Meta Data Description */
Proc Contents Data=Group5.Auto_Mpg;
Run;
Title 'Lisitng First 10 Observations';
Proc Print Data = Group5.Auto_Mpg (obs = 5);
Run;
/* Descriptive statistics and distribution of Targeet variable MPG */
Title 'Descrpitive Statitics for MPG';
Proc Means Data = Group5.Auto_Mpg;
Var mpg;
Run;
Title 'Histogram of MPG';
Proc Sgplot Data = Group5.Auto_Mpg;
Histogram Mpg;
Density Mpg;
Density Mpg / type=kernel;
Run;
/* Categorical Values */
options nolabel;
Title 'Listing Frequencies for Cylinders';
Proc Freq Data=Group5.Auto_Mpg;
Tables Cylinders Year Origin / nocum missing;
Run;
Title;
/* Checking for missing categorical values using informat method */
Proc Format;
Value Origin_Check
1,2,3 = 'Valid'
other = 'Invalid';
Value Cyl_Check
3,4,5,6,8 = 'Valid'
other = 'Invalid';
Value Year_Check
70-82 = 'Valid'
other = 'Invalid';
Run;
Data _null_;
File Print;
Set Group5.Auto_Mpg (Keep = Name Cylinders Year Origin);
If put(Cylinders, Cyl_Check.) = 'Invalid' then put
'Missing observation of Cylinders = ' _n_ name Cylinders =;
Else if Put(Year, Year_Check.) = 'Invalid' then put
'Missing observation of Year = ' _n_ name Year =;
Else if put(Origin,Origin_Check.) = 'Invalid' then put
'Missing observation of Origin = ' _n_ name Origin = ;
Run;
Title 'Checking for Missing values of Categorical variables';
Proc Freq Data=Group5.Auto_Mpg;
Tables Cylinders Year Origin / nocum nopercent;
Format Cylinders Cyl_Check. Year Year_Check. Origin Origin_Check.;
Run;
/* Converting Date from 2 digit number to Date9. format */
Data Group5.Auto_Mpg;
Set Group5.Auto_Mpg;
Year_new = Cat('03/01/19',Year);
Year = year(input(Year_new, mmddyy10.));
Drop Year_new;
Run;
Title 'Printing first 5 observations';
Proc Print Data = Group5.Auto_Mpg (obs = 5);
Var Name Year;
Run;
/* Derive Vehicle Make and Model from Name */
Data Group5.Auto_Mpg;
Set Group5.Auto_Mpg;
Name = Propcase(Compress(Name,'"'));
Array model_n [6] $20. Model1-Model6;
Do i = 1 to 6;
Model_n [i] = compress(Scan(Name,i),"'");
End;
If _n_ = 293 then Model3 = '';
Brand = Model1;
Model = Catx('',Model2,model3,model4,model5,model6);
Drop Model1-Model6 Name i;
Run;
/* Checking Errors in Brand */
Title 'Checking errors in Brand';
Proc Freq Data = Group5.Auto_Mpg;
Tables Brand / nocum nopercent;
Run;
/*Correcting Spelling errors for variable Brand */
Data Group5.Auto_Mpg;
Set Group5.Auto_Mpg;
Brand = Tranwrd(Brand,'Vw','Volkswagen');
Brand = Tranwrd(Brand,'Vokswagen','Volkswagen');
Brand = Tranwrd(Brand,'Chevroelt','Chevrolet');
Brand = Tranwrd(Brand,'Chevy','Chevrolet');
Brand = Tranwrd(Brand,'Maxda','Mazda');
Brand = Tranwrd(Brand,'Toyouta','Toyota');
Run;
Data Group5.Auto_Mpg;
Retain Brand Model Cylinders Year Origin MPG Displacement
Horsepower Weight Acceleration;
Set Group5.Auto_Mpg;
Run;
Title 'Checking Corrected Brands';
Proc Freq Data = Group5.Auto_Mpg;
Tables Brand*Origin / nocum nopercent norow nocol;
Run;
/* Numerical Variables */
options nolabel;
Proc MEans Data = Group5.Auto_Mpg
n nmiss min max mean median mode stddev var ;
Var mpg acceleration displacement weight horsepower;
Run;
/* Checking Missing Numeric Observations */
Title 'Identifying Missing numeric values';
Data _null_;
File print;
Set Group5.Auto_Mpg;
Array Numeric [*] _NUMERIC_;
Do i = 1 to Dim(Numeric);
If missing(numeric(i)) then put
'Missing Observation ' Brand = Model = Mpg = Cylinders = Displacement = Horsepower =
Weight = Acceleration = ;
End;
Run;
/* Checking Mean Horsepower for various Cylinder categories */
Proc Means Data = Group5.Auto_Mpg;
Class Cylinders;
Var Horsepower;
Run;
/* Replacing missing horsepower with mean horespower grouped by Cylinders */
Proc Sort Data = Group5.Auto_Mpg; by Cylinders; Run;
Proc Stdize data = Group5.Auto_Mpg out = Group5.Auto_Mpg
reponly method = mean;
by cylinders;
Run;
/* Calculatin a new variable Power-Weight Ratio */
Data Group5.Auto_Mpg;
Set Group5.Auto_Mpg;
PWR = horsepower/weight;
Run;
/* Detecting outliers before Imputing missing values of Horsepower */
Proc Univariate Data = Group5.Auto_Mpg plots;
Var mpg acceleration displacement weight horsepower pwr;
Run;
/* After Checking we see variable Acceleration has normal distribution. Hence, we will use
Standard Deviation method to detect Outliers */
Proc Means Data = Group5.Auto_Mpg noprint;
Var Acceleration;
Output out = Means (drop = _type_ _freq_)
Mean =
Std = / autoname;
Run;
Proc Means Data = Group5.Auto_Mpg noprint;
Var pwr;
Output out = IQR (drop = _type_ _freq_)
Q1 =
Q3 =
Qrange = / autoname;
Run;
/* Detecting OUtliers for Power-Weight Ration using Inter Quartile Range */
Title 'Listing Outliers for Power-Weight Ratio';
Data _NULL_;
Set Group5.Auto_Mpg (keep = pwr Brand Model);
File Print;
If _n_ =1 then set IQR;
If pwr < pwr_Q1 - 1.5*pwr_Qrange or
pwr > pwr_Q3 + 1.5*pwr_Qrange then
Put 'Outlier detected for ' Brand Model ' Power-Weight ratio = ' pwr;
Run;
Title;
Title 'Listing Outliers for Power-Weight Ratio';
Data Group5.Auto_Mpg;
Set Group5.Auto_Mpg;
If _n_ =1 then set IQR;
If pwr < pwr_Q1 - 1.5*pwr_Qrange or
pwr > pwr_Q3 + 1.5*pwr then delete;
Drop pwr_Q1 pwr_Q3 pwr_Qrange;
Run;
Title;
/* Detecting Outliers for Acceleration */
Title 'Listing Outliers for Acceleration';
Data _NULL_;
Set Group5.Auto_Mpg (keep = Acceleration Brand Model);
File Print;
If _n_ =1 then set Means;
If Acceleration <= Acceleration_Mean - 2*Acceleration_StdDev or
Acceleration > Acceleration_Mean + 2*Acceleration_StdDev then
Put 'Outlier detected for ' Brand Model ' where Acceleration = ' Acceleration;
Run;
Title 'Listing Outliers for Acceleration';
Data Group5.Auto_Mpg;
Set Group5.Auto_Mpg;
If _N_ = 1 then set means;
If Acceleration < Acceleration_Mean - 2*Acceleration_StdDev or
Acceleration > Acceleration_Mean + 2*Acceleration_StdDev then delete;
Drop Acceleration_MEan Acceleration_StdDev;
Run;
Proc Univariate Data = Group5.Auto_Mpg plots;
Var Acceleration;
Run;
/* Checking Skewness of Variable Horsepower using QQplot and Histogram */
Title 'Histogram for Horsepower';
Proc sGplot Data = Group5.Auto_Mpg;
Histogram horsepower;
Density horsepower;
Density horsepower / type=kernel;
Run;
Proc Gchart Data = Group5.Auto_Mpg;
vbar horsepower;
Run;
Title 'QQ-Plot for Horsepower';
Proc Univariate Data = Group5.Auto_Mpg;
Var horsepower;
qqplot ;
Run;
/* Applying Log10 transformation on Horsepower */
Data Log_test;
Set Group5.Auto_Mpg;
LogHP = Log(horsepower);
Run;
Title 'Histogram of Horsepower after Log Transformation';
Proc sGplot Data = log_test;
Histogram loghp;
Density loghp;
Density loghp/ type=kernel;
Run;
Title 'QQ-Plot of Horsepower after Log Transformation';
Proc Univariate Data = log_test plots;
Var Loghp;
Run;
Title 'Listing First 5 Observations from Final Dataset';
Proc Print Data = group5.auto_mpg (obs = 5);
Run;
Data Group5.Auto_Mpg;
Set Group5.Auto_Mpg;
Label Brand = 'Brand of the Vehicle'
Model = 'Model name of vehicle'
Cylinders = 'Number of Cylinders. Categorical Variable which can take following values:
4, 6 or 8'
Year = 'The year in which the vehicle was manufactured'
Origin = 'Country of Origin of the Vehicle Brand. Has the following categories:
Unites States = 1
Germany =2
Japan = 3'
MPG = 'City fuel cycle measured in miles/gallon'
Displacement = 'Engine size of vehicle measured in cubic centimetres(CC)'
Horsepower = 'Horsepower of the vehicle'
Weight = 'Weight of vehicle in lbs'
Acceleration = 'Time taken to reach from 0-60 mph'
PWR = 'Power to weight ratio of vehicle measured as hp/lbs';
Run;
options label;
Proc Contents Data = Group5.Auto_Mpg;
ODS Select variables;
Run;
Proc sgplot data = group5.auto_mpg;
histogram mpg;
density mpg;
density mpg / type = kernel;
Run;
Proc sgplot data = group5.auto_mpg;
reg x = horsepower y = mpg / cli clm;
Run;
Proc sgplot data = group5.auto_mpg;
reg x = weight y = mpg / cli clm;
Run;
Proc sgplot data = group5.auto_mpg;
reg x = pwr y = mpg / cli clm;
Run;
Proc sgplot data = group5.auto_mpg;
reg x = displacement y = mpg / cli clm;
Run;
Proc Univariate Data = group5.auto_mpg plots;
Var mpg;
Run;