-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathAssignment1_Clustering.sas
109 lines (83 loc) · 2.79 KB
/
Assignment1_Clustering.sas
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
/*
1. Import data into the SAS WORK library.
2. Filter Virginia (VA) data.
3. Filter out outlier and blank rows (ZIP code 0 or 99999).
4. Make categories of income (6 categories).
5. Perform clustering: (a) hierarchical, (b) k-means.
*/
/*
 * Read the CSV file into WORK.Income_byZIP_2015.
 * REPLACE overwrites an existing data set of the same name.
 * NOTE(review): DATAFILE= is an absolute path on one machine, and the quoted
 * literal continues onto the following line. Keep those two lines exactly as
 * they are unless the whole path is re-entered on a single line.
 */
PROC IMPORT OUT= WORK.Income_byZIP_2015
DATAFILE= "C:\Users\sanja\Google Drive\2ndSem\BIA672_Marketi
ngAnalytics_KashaDehnad\RAW-Data\15zpallagi.csv"
DBMS=CSV REPLACE;
/* Variable names are taken from the first row; data values start on row 2. */
GETNAMES=YES;
DATAROW=2;
RUN;
/*
 * Route every imported row to exactly one of three data sets:
 *   junk             - rows whose zipcode is 0 or 99999
 *   Income_2015_VA   - remaining rows with state = "VA"
 *   Income_2015_rest - all other rows
 * Only STATEFIPS, zipcode, State, agi_stub and N1 are written out.
 */
data Income_2015_VA Income_2015_rest junk;
    set Income_byzip_2015;
    keep STATEFIPS zipcode State agi_stub N1;

    /* First matching WHEN wins, mirroring an if / else-if chain. */
    select;
        when (zipcode = 0 or zipcode = 99999) output junk;
        when (state = "VA")                   output Income_2015_VA;
        otherwise                             output Income_2015_rest;
    end;
run;
/*
 * Numeric format AGIFMT: maps codes 1-6 to income-range labels and labels
 * every other value 'Unknown'.
 * NOTE(review): presumably intended for AGI_STUB; no step in this file
 * applies the format yet - confirm where it should be used.
 */
proc format;
    value agifmt
        1     = '$1 under $25,000'
        2     = '$25,000 under $50,000'
        3     = '$50,000 under $75,000'
        4     = '$75,000 under $100,000'
        5     = '$100,000 under $200,000'
        6     = '$200,000 or more'
        other = 'Unknown'   /* was misspelled 'Unkown' */
    ;
run;
/*
 * Sort the Virginia rows in place.
 * The sort key matches the BY statement of the later MERGE
 * (STATEFIPS state zipcode). Sorting by zipcode alone only satisfied that
 * MERGE because the data set holds a single state.
 */
proc sort data=income_2015_va;
    by STATEFIPS state zipcode;
run;
/*
 * Split the Virginia rows into one data set per AGI_STUB value (1-6).
 * N1 is renamed to Returns on input. Rows with any other AGI_STUB value
 * are written to EMPTY.
 */
data agi1 agi2 agi3 agi4 agi5 agi6 empty;
    set income_2015_va (rename=(n1=Returns));

    /* First matching WHEN wins, mirroring an if / else-if chain. */
    select;
        when (AGI_STUB = 1) output agi1;
        when (AGI_STUB = 2) output agi2;
        when (AGI_STUB = 3) output agi3;
        when (AGI_STUB = 4) output agi4;
        when (AGI_STUB = 5) output agi5;
        when (AGI_STUB = 6) output agi6;
        otherwise           output empty;
    end;
run;
/*
 * Build one wide row per STATEFIPS / state / zipcode by match-merging the six
 * per-bracket data sets. AGI_STUB is dropped from each input, and each input's
 * Returns column is renamed to Returns1 .. Returns6.
 */
data agi_all;
    merge
        agi1 (drop=AGI_STUB rename=(Returns=Returns1))
        agi2 (drop=AGI_STUB rename=(Returns=Returns2))
        agi3 (drop=AGI_STUB rename=(Returns=Returns3))
        agi4 (drop=AGI_STUB rename=(Returns=Returns4))
        agi5 (drop=AGI_STUB rename=(Returns=Returns5))
        agi6 (drop=AGI_STUB rename=(Returns=Returns6))
    ;
    by STATEFIPS state zipcode;
run;
/*
 * For every row of AGI_ALL compute:
 *   total                      - sum of Returns1 .. Returns6 (missing values ignored)
 *   Returns_pct1..Returns_pct6 - each bracket's share of total, in percent,
 *                                rounded to 0.01
 * A bracket that is missing after the merge is counted as 0 returns, so one
 * absent bracket does not turn that ZIP's percentage into a missing value.
 * When total is zero or missing, the percentages are set to missing instead of
 * dividing by zero.
 */
data zip_income_pct;
    drop i;
    set agi_all;
    array Returns_pcts {6} Returns_pct1 - Returns_pct6;
    array Returns      {6} Returns1 - Returns6;

    total = sum(of Returns1 - Returns6);

    do i = 1 to 6;
        /* A missing total compares as less than 0, so it takes the ELSE branch. */
        if total > 0 then
            Returns_pcts[i] = round((sum(Returns[i], 0) / total) * 100, .01);
        else
            Returns_pcts[i] = .;
    end;
run;
/*
 * K-means clustering of the six percentage variables into at most 10 clusters.
 * OUT= writes the input rows plus the cluster assignment to
 * KMEANS_ZIP_INCOME_PCT. ID labels observations by zipcode.
 *
 * MAXITER= is set explicitly: PROC FASTCLUS defaults to MAXITER=1, which
 * recomputes the cluster seeds only once. Raising it to 100 lets the
 * procedure iterate until its convergence criterion is met, or until 100
 * iterations have run.
 */
proc fastclus data=zip_income_pct
              maxclusters=10
              maxiter=100
              out=kmeans_zip_income_pct;
    var Returns_pct1 Returns_pct2 Returns_pct3 Returns_pct4 Returns_pct5 Returns_pct6;
    id zipcode;
run;
/*
 * Order the k-means output by cluster (sorted in place), then print it with
 * one BY group per cluster.
 */
proc sort data=kmeans_zip_income_pct;
    by cluster;
run;

proc print data=kmeans_zip_income_pct;
    by cluster;
    /* Disabled column restriction, kept for reference:
       var zipcode Returns_pct1 Returns_pct2 Returns_pct3 Returns_pct4 Returns_pct5 Returns_pct6; */
run;
/*
 * Hierarchical clustering of the six percentage variables using single
 * linkage. The tree is written to HIRERCHIAL_ZIP_INCOME_PCT (name kept as
 * spelled, because later steps reference it). ID labels observations by
 * zipcode.
 */
proc cluster
        data=zip_income_pct
        method=SINGLE
        outtree=hirerchial_zip_income_pct;
    var Returns_pct1 Returns_pct2 Returns_pct3 Returns_pct4 Returns_pct5 Returns_pct6;
    id zipcode;
run;
/* Draw the tree diagram from the PROC CLUSTER OUTTREE= data set. */
proc tree data=hirerchial_zip_income_pct;
run;

/*
 * Cut the same tree at 10 clusters and write the assignments to data set OUT,
 * carrying zipcode along via COPY. NOPRINT suppresses the diagram.
 * DATA= is stated explicitly so this step does not rely on _LAST_, which would
 * silently change if another data-set-creating step were inserted above it.
 */
proc tree data=hirerchial_zip_income_pct noprint ncl=10 out=out;
    copy zipcode;
run;