-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathPCA.R
130 lines (98 loc) · 3.32 KB
/
PCA.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
library("plot3D")
#load dataset
dataset <- read.csv('usa_final.csv', sep=',')
rownames(dataset)<-dataset[,1]
dataset <- dataset[,2:19]
dataset <- scale(dataset)
n <- nrow(dataset)
p <- ncol(dataset)
#generate mean and standard deviation
M <- colMeans(dataset)
sigma <- apply(dataset,2,sd)
descriptive<-round(cbind(M,sigma),2)
descriptive
#create correlation matrix
rho <- cor(dataset)
autoval <- eigen(rho)$values
autovec <- eigen(rho)$vectors
pvarsp = autoval/p
pvarspcum = cumsum(pvarsp)
pvarsp
tab<-round(cbind(autoval,pvarsp*100,pvarspcum*100),3)
colnames(tab)<-c("eigenval","%var","%cumvar")
tab
#scree diagram to select the components
plot(autoval,type="b",main="Scree Diagram", xlab="Number of Component", ylab="Eigenvalues")
abline(h=1,lwd=3,col="red")
#get only the significant eigenvectors
#The main issue in this analysis is the forth component:
#in fact, it has a significant explanatory importance
#since the eigenvalue is above 1
#and it moves the cumulative variance explained from (82% to 88%).
#On the other hand, using four components would be a problem when we plot the results.
#The analysis will be split into two parts: the first one will deal with four components
#while the second one will use only the first three of them in order to visualize the results.
#part 1 - four components
eigen(rho)$vectors[,1:4]
#we investigate what is explained by the components
comp<-round(cbind(
-eigen(rho)$vectors[,1]*sqrt(autoval[1]),
-eigen(rho)$vectors[,2]*sqrt(autoval[2]),
-eigen(rho)$vectors[,3]*sqrt(autoval[3]),
-eigen(rho)$vectors[,4]*sqrt(autoval[4])
),3)
rownames(comp)<-row.names(descriptive)
colnames(comp)<-c("comp1","comp2","comp3","comp4")
comp
communality<-comp[,1]^2+comp[,2]^2+comp[,3]^2+comp[,4]^2
comp<-cbind(comp,communality)
comp
#part 2 - three components
eigen(rho)$vectors[,1:3]
#we investigate what is explained by the components
comp<-round(cbind(
-eigen(rho)$vectors[,1]*sqrt(autoval[1]),
-eigen(rho)$vectors[,2]*sqrt(autoval[2]),
-eigen(rho)$vectors[,3]*sqrt(autoval[3])
),3)
rownames(comp)<-row.names(descriptive)
colnames(comp)<-c("comp1","comp2","comp3")
communality<-comp[,1]^2+comp[,2]^2+comp[,3]^2
comp<-cbind(comp,communality)
comp
#calculate the scores for the selected components and graph them
#calculate components for each unit
score <- dataset%*%autovec[,1:3]
round(score,3)
#score plot
scorez<-round(cbind
(-score[,1]/sqrt(autoval[1]),
score[,2]/sqrt(autoval[2]),
score[,3]/sqrt(autoval[3])),2)
x <- scorez[,1]
y <- scorez[,2]
z <- scorez[,3]
#plots
scatter3D(x, y, z, colvar = dataset[,1],
xlab = "comp1", ylab = "comp2", zlab = "comp3",
main = "PCA - 3 components",
bty = "g", ticktype = "detailed", d = 2,
theta = 60, phi = 20, col = gg.col(100),
clab = "aqi",
pch = 19, cex = 0.8)
text3D(x,y,z,labels = rownames(dataset),add = TRUE,
cex = 0.5, col = gg.col(100),
adj = 0.5, font = 1)
compz<-round(cbind
(-comp[,1]/sqrt(autoval[1]),
comp[,2]/sqrt(autoval[2]),
comp[,3]/sqrt(autoval[3])),2)
x <- compz[,1]
y <- compz[,2]
z <- compz[,3]
scatter3D(x,y,z,
col = "red",add = TRUE,
pch = 19, cex = 0.8)
text3D(x,y,z,labels = row.names(comp),add = TRUE,
cex = 0.5, col = "red",
adj = 0.5, font = 1)