Skip to content

Latest commit

 

History

History
674 lines (544 loc) · 14 KB

visual_1.md

File metadata and controls

674 lines (544 loc) · 14 KB
title author output
Data visualization
Yohan
html_document
preserve_yaml toc toc_float keep_md
true
true
true
true
## gapminder
str(gapminder)
## Classes 'tbl_df', 'tbl' and 'data.frame':	1704 obs. of  6 variables:
##  $ country  : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ year     : int  1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##  $ lifeExp  : num  28.8 30.3 32 34 36.1 ...
##  $ pop      : int  8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
##  $ gdpPercap: num  779 821 853 836 740 ...
ggplot(data = gapminder, aes(x = year, y = lifeExp,
                             group = country, color = continent)) +
  geom_line(alpha = 0.5) + 
  facet_wrap( ~ continent) +
  xlab("Year") + ylab("Life expectancy") +
  ggtitle("Life expectancy over time") + theme_bw()

ggplot(data = gapminder, aes(x = continent, y = year, color = continent)) +
  geom_point()

ggplot(data = gapminder, aes(x = continent, y = year, color = continent)) +
  geom_point(position = position_jitter(width = 0.5, height = 2))

ggplot(data = gapminder, aes(x = year, y = lifeExp, group = country)) +
  geom_line(alpha = 0.5, aes(color = "Country", size = "Country")) +
  geom_line(stat = "smooth", method = "loess",
            aes(group = continent, color = "Continent", size = "Continent"), 
            alpha = 0.5) +
  facet_wrap(~ continent, nrow = 2) +
  scale_color_manual(name = "Life Exp. for:",
                     values = c("Country" = "black", "Continent" = "dodgerblue1")) +
  scale_size_manual(name = "Life Exp. for:", 
                    values = c("Country" = 0.25, "Continent" = 3)) +
  theme_minimal(base_size = 14) + 
  ylab("Years") + xlab("") + 
  ggtitle("Life Expectancy, 1952-2007", subtitle = "By continent and country") +
  theme(legend.position=c(0.75, 0.2), axis.text.x = element_text(angle = 45))

gapminder %>%
  filter(year == 1952) %>%
  group_by(continent) %>%
  summarize(medianGdpPercap = median(gdpPercap)) %>% 
  ggplot(aes(x = continent, y = medianGdpPercap)) +
  geom_col()

gapminder %>%
  filter(year == 1952) %>% 
  ggplot(aes(x = pop)) +
  geom_histogram() +
  scale_x_log10()

gapminder %>%
  filter(year == 1952) %>% 
  ggplot(aes(x = continent, y = gdpPercap, color = continent)) +
  geom_boxplot() +
  scale_y_log10() +
  ggtitle("Comparing GDP per capita across continents")

## tidy 
str(iris)
## 'data.frame':	150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
iris.tidy <- iris %>%
  gather(key, Value, -Species) %>%
  separate(key, c("Part", "Measure"), "\\.")

iris$Flower <- 1:nrow(iris)

iris.wide <- iris %>%
  gather(key, value, -Species, -Flower) %>%
  separate(key, c("Part", "Measure"), "\\.") %>%
  spread(Measure, value)


ggplot(iris.wide, aes(x = Length, y = Width, color = Part)) +
  geom_jitter() +
  facet_grid(. ~ Species)

## cluster analysis
ggpairs(iris[,-6],mapping=aes(color=Species))

ggpairs(iris, columns = 1:4, 
        aes(color=Species, alpha=0.4), 
        title="Scatterplot Matrix",
        upper=list(continuous="density", combo="box"),
        lower=list(continuous="smooth", combo="dot")) +
  theme_light() +
  theme(plot.title=element_text(size=10))

ggplot(iris, aes(x=Petal.Length, y=Sepal.Width, colour=Species) ) +
  geom_point(size= 2.5) +
  geom_smooth(method="lm") +             
  labs(title="Aggregated Data")

set.seed(123)
cluster=kmeans(iris[,1:4],3)
iris$cluster=as.factor(cluster$cluster)
ggpairs(iris,columns = 1:5, mapping=aes(color=cluster))

set.seed(456)
performance=c()
for (i in rep(1:100,times=30)) {
  clust=kmeans(iris[,1:4],i)
  performance=c(performance,1-clust$tot.withinss/clust$totss)
}
perf_df=data.frame(metrics=performance,number_of_center=rep(1:100,times=30))
ggplot(perf_df,aes(x=number_of_center,y=metrics)) +
  geom_point(alpha=0.2) +
  geom_vline(xintercept = 3,color='red')

## point 
data(diamonds)
str(diamonds)
## Classes 'tbl_df', 'tbl' and 'data.frame':	53940 obs. of  10 variables:
##  $ carat  : num  0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
##  $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
##  $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
##  $ depth  : num  61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table  : num  55 61 65 58 58 57 57 55 61 61 ...
##  $ price  : int  326 326 327 334 335 336 336 337 337 338 ...
##  $ x      : num  3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y      : num  3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z      : num  2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
diamonds %>% 
  ggplot(aes(x = carat, y = price)) +
  geom_point() +
  geom_smooth()

ggplot(diamonds, aes(x = carat, y = price, color = clarity)) +
  geom_point(alpha = 0.4)+
  geom_smooth()

ggplot(diamonds, aes(x = carat, y = price)) +
  geom_point(alpha = 0.4)+
  geom_smooth(aes(color = clarity))

ggplot(diamonds, aes(x = clarity, y = carat, color = price)) +
  geom_point(alpha = 0.5)

ggplot(diamonds, aes(x = clarity, y = carat, color = price)) +
  geom_point(alpha = 0.5, position = "jitter")

data("mtcars")
str(mtcars)
## 'data.frame':	32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...
mtcars <- mtcars %>% 
  mutate_at(vars(cyl, am), factor)
plot(mtcars$wt, mtcars$mpg, col = mtcars$cyl)

lapply(mtcars$cyl, function(x) {
  abline(lm(mpg ~ wt, mtcars, subset = (cyl == x)), col = x)
})
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL
## 
## [[8]]
## NULL
## 
## [[9]]
## NULL
## 
## [[10]]
## NULL
## 
## [[11]]
## NULL
## 
## [[12]]
## NULL
## 
## [[13]]
## NULL
## 
## [[14]]
## NULL
## 
## [[15]]
## NULL
## 
## [[16]]
## NULL
## 
## [[17]]
## NULL
## 
## [[18]]
## NULL
## 
## [[19]]
## NULL
## 
## [[20]]
## NULL
## 
## [[21]]
## NULL
## 
## [[22]]
## NULL
## 
## [[23]]
## NULL
## 
## [[24]]
## NULL
## 
## [[25]]
## NULL
## 
## [[26]]
## NULL
## 
## [[27]]
## NULL
## 
## [[28]]
## NULL
## 
## [[29]]
## NULL
## 
## [[30]]
## NULL
## 
## [[31]]
## NULL
## 
## [[32]]
## NULL
legend(x = 5, y = 33, legend = levels(mtcars$cyl),
       col = 1:3, pch = 1, bty = "n")

ggplot(mtcars, aes(x = wt, y = mpg, col = cyl)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_smooth(aes(group = 1), method = "lm", se = FALSE, linetype = 2)

## aesthetics 
### x, y, color, fill, size, alpha, labels, linetype, shape
### variable: continuous, discrete
ggplot(mtcars, aes(x = wt, y = mpg, fill = cyl, col = am)) +
  geom_point(shape = 21, size = 4, alpha = 0.6)

ggplot(mtcars, aes(x = wt, y = mpg, size = cyl)) +
  geom_point()

ggplot(mtcars, aes(x = wt, y = mpg, alpha = cyl)) +
  geom_point()

ggplot(mtcars, aes(x = wt, y = mpg, shape = cyl)) +
  geom_point()

ggplot(mtcars, aes(x = wt, y = mpg, label = cyl)) +
  geom_text()

ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_text(label = rownames(mtcars), color = 'red')

ggplot(mtcars, aes(x = mpg, y = qsec, col = cyl, shape = am, 
                   size = (hp/wt))) +
  geom_point()

ggplot(mtcars, aes(x = mpg, y = 0)) +
  geom_jitter() +
  scale_y_continuous(limits = c(-2,2))

ggplot(mtcars, aes(x = cyl, y = wt)) +
  geom_jitter(width = 0.1, alpha = 0.6, shape = 1)

ggplot(mtcars, aes(x = cyl, y = wt)) +
  geom_point(position = position_jitter(0.1))

### histogram 
ggplot(mtcars, aes(mpg)) +
  geom_histogram(aes(y = ..density..), binwidth = 1, fill = "#377EB8")

ggplot(mtcars, aes(mpg, fill = cyl)) +
  geom_histogram(binwidth = 1)

ggplot(mtcars, aes(mpg, fill = cyl)) +
  geom_histogram(binwidth = 1, position = "dodge")

ggplot(mtcars, aes(mpg, color = cyl)) +
  geom_freqpoly(binwidth = 1)

ggplot(mtcars, aes(mpg, fill = cyl)) +
  geom_histogram(binwidth = 1, position = "identity", alpha = 0.4)

### bar
ggplot(mtcars, aes(x = cyl, fill = am)) +
  geom_bar(position = "stack")

ggplot(mtcars, aes(x = cyl, fill = am)) +
  geom_bar(position = "fill") +
  scale_fill_brewer()

ggplot(mtcars, aes(x = cyl, fill = am)) +
  geom_bar(position = "dodge")

ggplot(mtcars, aes(x = cyl, fill = am)) +
  geom_bar(position = position_dodge(0.2), alpha = 0.6)

ggplot(mtcars, aes(x = 1, fill = factor(cyl))) +
  geom_bar() +
  coord_polar(theta = "y")

ggplot(mtcars, aes(x = 1, fill = cyl)) +
  geom_bar(width = .1) +
  scale_x_continuous(limits = c(0.5,1.5)) +
  coord_polar(theta = "y")

### facet 
mtcars$cyl_am <- paste(mtcars$cyl, mtcars$am, sep = "_")
myCol <- rbind(brewer.pal(9, "Blues")[c(3,6,8)],
               brewer.pal(9, "Reds")[c(3,6,8)])

ggplot(mtcars, aes(x = wt, y = mpg, col = cyl_am)) +
  geom_point() +
  scale_color_manual(values = myCol)

ggplot(mtcars, aes(x = wt, y = mpg, col = cyl_am, size = disp)) +
  geom_point() +
  scale_color_manual(values = myCol) +
  facet_grid(gear ~ vs)

### qplot
qplot(wt, mpg, data = mtcars)

qplot(wt, mpg, data = mtcars, size = cyl)

qplot(wt, mpg, data = mtcars, color = hp)

qplot(cyl, factor(vs), data = mtcars)

qplot(cyl, factor(vs), data = mtcars, geom = "jitter")

ggplot(mtcars, aes(cyl, wt, col = am)) +
  geom_point(position = position_jitter(0.2, 0))

ggplot(mtcars, aes(cyl, wt, fill = am)) +
  geom_dotplot(stackdir = "center", binaxis = "y")

qplot(
  cyl, wt,
  data = mtcars,
  fill = am,
  geom = "dotplot",
  binaxis = "y",
  stackdir = "center"
)

head(ChickWeight)
## Grouped Data: weight ~ Time | Chick
##   weight Time Chick Diet
## 1     42    0     1    1
## 2     51    2     1    1
## 3     59    4     1    1
## 4     64    6     1    1
## 5     76    8     1    1
## 6     93   10     1    1
ggplot(ChickWeight, aes(x = Time, y = weight)) +
  geom_line(aes(group = Chick))

ggplot(ChickWeight, aes(x = Time, y = weight, color = Diet)) +
  geom_line(aes(group = Chick))

ggplot(ChickWeight, aes(x = Time, y = weight, color = Diet)) +
  geom_line(aes(group = Chick), alpha = 0.3) +
  geom_smooth(lwd = 2, se = FALSE)


title: "visual_1.R" author: "Yohan_Min" date: "Mon Feb 11 14:48:38 2019"