-
Notifications
You must be signed in to change notification settings - Fork 0
/
genetic_variation.R
148 lines (101 loc) · 3.05 KB
/
genetic_variation.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
---
# SNP and Genetic Variation
# Download data
data <- read.csv ('/Users/sanzidaakhteranee/Documents/HackBio_Contest/Phase_2/Data.csv', header=TRUE, sep =',')
print(data)
```
# General statistics
#one sample t-test for minor allele frequency
t.test (data$EFFECT_ALLELE_FREQ, value=0.01, alternative ='greater')
#Report: general statistics
Null Hypothesis = SNP p value = 0.01
Alt Hypothesis = height variability >0.01
so, p value less than 0.01 means we can reject null hypothesis and
accept alternative hypothesis
#SNP significant p value over all populations
SNP = unique(data$SNPID) #split data based on SNPID
new_data <- data.frame() #create new data frame
for (i in SNP){
subset_data <- data[data$SNPID == i, ] # create subset_data for looping SNPID iteration
if (!any(is.na(subset_data$P)) && !any(is.na(subset_data$EFFECT_ALLELE_FREQ))) {
if (any(subset_data$P < 0.01) && any(subset_data$EFFECT_ALLELE_FREQ > 0.01)){
new_data <- rbind(new_data, subset_data) # combind two data together
} }
}
print(new_data)
View(new_data)
```
#Report for SNP p value
From above code, it shows total 2281 observations that are significantly
different SNP among all super populations
# Data partitions among 5 different populations
partitions <- split(data, data$ANCESTRY)
print(partitions)
```
# PCA analysis for EUROPEAN populations
population=partitions$EUROPEAN
population
my_data <- data.frame (geno=data$EFFECT_ALLELE, pop='population')
my_data
my_data <- matrix(rnorm(2500), ncol =2)
# Perform PCA
pca_result <- prcomp(my_data, scale. = TRUE)
# Summary of PCA results
summary(pca_result)
# Biplot for visualization
biplot(pca_result)
```
# PCA analysis for AFRICAN populations
```{r}
population=partitions$AFRICAN
population
my_data <- data.frame (geno=data$EFFECT_ALLELE, pop='population')
my_data
my_data <- matrix(rnorm(2500), ncol =2)
# Perform PCA
pca_result <- prcomp(my_data, scale. = TRUE)
# Summary of PCA results
summary(pca_result)
# Biplot for visualization
biplot(pca_result)
```
# PCA analysis for SOUTH ASIA populations
population=partitions$SOUTH_ASIA
population
my_data <- data.frame (geno=data$EFFECT_ALLELE, pop='population')
my_data
my_data <- matrix(rnorm(2500), ncol =2)
# Perform PCA
pca_result <- prcomp(my_data, scale. = TRUE)
# Summary of PCA results
summary(pca_result)
# Biplot for visualization
biplot(pca_result)
```
$PCA analysis for EAST ASIA populations
```{r}
population=partitions$EAST_ASIA
population
my_data <- data.frame (geno=data$EFFECT_ALLELE, pop='population')
my_data
my_data <- matrix(rnorm(2500), ncol =2)
# Perform PCA
pca_result <- prcomp(my_data, scale. = TRUE)
# Summary of PCA results
summary(pca_result)
# Biplot for visualization
biplot(pca_result)
```
#PCA analysis for HISPANIC populations
```{r}
population=partitions$HISPANIC
population
my_data <- data.frame (geno=data$EFFECT_ALLELE, pop='population')
my_data
my_data <- matrix(rnorm(2500), ncol =2)
# Perform PCA
pca_result <- prcomp(my_data, scale. = TRUE)
# Summary of PCA results
summary(pca_result)
# Biplot for visualization
biplot(pca_result)