With the 20th anniversary of Hybrid Theory by Linkin Park, I decided to put together a data package that contains a few different datasets about the band.
You can install the package from GitHub using the remotes package:
# Install the package directly from its GitHub repository.
remotes::install_github(repo = 'delabj/linkinpark')
This is a relatively small data set containing data about peak positions on The Billboard Hard Rock Chart. This data includes the album name, peak position reached, the date it peaked and the number of weeks the album charted.
# Render the first rows of the chart data as a markdown table.
knitr::kable(head(billboard_albums))
album_title | peak_position | date_peaked | weeks_on_chart | chart_name |
---|---|---|---|---|
Hybrid Theory | 2 | 2002-01-11 | 216 | Album Sales |
Meteora | 1 | 2003-01-11 | 113 | Album Sales |
Minutes To Midnight | 1 | 2007-06-01 | 96 | Album Sales |
A Thousand Suns | 1 | 2010-10-01 | 39 | Album Sales |
Living Things | 1 | 2012-07-13 | 36 | Album Sales |
Live In Texas | 23 | 2003-12-05 | 35 | Album Sales |
This dataset is a collection of awards from the Recording Industry Association of America (RIAA) that were given to Linkin Park. The RIAA certifies an album as gold after 500,000 sales, platinum after 1,000,000 sales, and diamond after 10,000,000 sales. This data is sourced from the RIAA gold/platinum search. Their database, however, does not include diamond awards due to their rarity. For certification purposes, a sale is a unit equivalent to one of the following:
- Sale of a digital or physical album
- 10 individual track downloads
- 1,500 instances of streamed audio or video from the album
Variable | Description |
---|---|
album_title | Name of the album |
release_date | Date of album release |
format | Type of release (album or single) |
certification | Certification level (gold, platinum) |
certification_date | Date of certification |
plat_modifer | Number of times the album has reached platinum |
This data comes from a poll on [/r/linkinpark](http://www.reddit.com/r/linkinpark) using allourideas, where users selected their favorite song in head-to-head pairings. There were over 47,000 votes, and these were the official results as of 2020-07-02.
Variable | Description |
---|---|
Rank | Ranking of the song |
Track | Name of the track |
Album | Album the track appeared on |
score | The Percentage of wins of a given song |
A data set created by using the {spotifyr} package to access the
Spotify API’s audio features for all Linkin Park content on Spotify.
This is provided as a convenience, so that users do not have to set up
their own Spotify API workflow.
Using the billboard data for charted albums we can easily make a table with {gt}
# Build a styled {gt} table of the charted albums, ordered by the date on
# which each album peaked.
gt::gt(arrange(billboard_albums, date_peaked)) %>%
  cols_label(
    album_title = "Album",
    peak_position = "Peak Position",
    date_peaked = "Date Peaked",
    weeks_on_chart = "Weeks Charted"
  ) %>%
  tab_header(title = md("**Linkin Park On Billboard's Hard Rock Chart**")) %>%
  # Colour the weeks-on-chart cells on a white-to-orange numeric scale.
  data_color(
    columns = vars(weeks_on_chart),
    colors = scales::col_numeric(
      palette = c("white", "#F39524"),
      domain = NULL
    )
  ) %>%
  fmt_date(columns = vars(date_peaked), date_style = 7) %>%
  cols_align(align = "right", columns = vars(date_peaked)) %>%
  opt_table_lines(extent = "default") %>%
  # Paint the outer borders and body rules white; only the rule under the
  # column labels gets a visible grey-green colour.
  tab_options(
    table.margin.left = px(20),
    table.border.top.color = "white",
    table.border.bottom.color = "white",
    table.border.bottom.width = px(3),
    table_body.hlines.color = "white",
    column_labels.border.top.color = "white",
    column_labels.border.top.width = px(3),
    column_labels.border.bottom.color = "#90998A"
  ) %>%
  opt_align_table_header(align = "left") %>%
  # Title typography: Montserrat at 25px, left aligned.
  tab_style(
    style = list(
      cell_text(
        font = google_font("Montserrat"),
        size = px(25),
        align = "left"
      )
    ),
    locations = list(
      cells_title(groups = "title")
    )
  ) %>%
  tab_source_note("Data: Billboard.com")
Using the Spotify audio features, we can see how different aspects of the band's sound evolved over time:
# Plot five Spotify audio features per track, coloured by the age of the
# album the track belongs to. Only albums whose names also occur in
# genius_lycics$album are kept.
audio_features %>%
  filter(album_name %in% genius_lycics$album) %>%
  distinct() %>%
  select(
    track_name, album_name, album_release_date,
    danceability, energy, speechiness, acousticness, valence
  ) %>%
  mutate(
    album_release_date = lubridate::ymd(album_release_date),
    # Age in 365-day years, measured back from 2020-10-01.
    age = as.numeric(
      lubridate::ymd("2020-10-01") - album_release_date,
      units = "days"
    ) / 365
  ) %>%
  pivot_longer(
    cols = c(danceability, energy, speechiness, acousticness, valence),
    names_to = "measure",
    values_to = "values"
  ) %>%
  group_by(track_name, album_name, age, measure) %>%
  # The same track can be listed more than once (per region) with slightly
  # different values, so collapse the duplicates to their median.
  summarise(values = median(values)) %>%
  ungroup() %>%
  # Fix the left-to-right order of the measures on the x axis.
  mutate(
    measure = factor(
      measure,
      levels = c(
        "valence", "acousticness", "speechiness", "danceability", "energy"
      )
    )
  ) %>%
  # Keep only albums whose lower-cased name does not contain "live".
  filter(str_detect(tolower(album_name), "live", negate = TRUE)) %>%
  ggplot(aes(x = measure, y = values)) +
  geom_line(aes(color = age, group = track_name), alpha = .3, size = .02) +
  geom_point(aes(color = age)) +
  theme_delabj() +
  scale_color_delabj("zune", discrete = FALSE) +
  guides(
    color = guide_colorbar(
      title = "Track Age",
      title.position = "top",
      title.hjust = .5,
      barwidth = 15
    )
  ) +
  labs(title = "Linkin Park's Audio Features", y = NULL)
Or we can look at these by just the albums for a slightly less chaotic plot
# One row per album with its release date, sorted oldest first. max() picks
# the latest date when an album has several listings.
release_dates <- audio_features %>%
  filter(album_type == "album") %>%
  group_by(album_name) %>%
  summarize(album_release_date = max(album_release_date)) %>%
  # Exclude names containing "live" or "acapellas", and keep only albums
  # that also occur in genius_lycics$album.
  filter(
    str_detect(tolower(album_name), "live", negate = TRUE),
    str_detect(tolower(album_name), "acapellas", negate = TRUE),
    album_name %in% genius_lycics$album
  ) %>%
  distinct() %>%
  arrange(album_release_date)
# Box plot (with jittered points) of per-track energy for each album, with
# albums ordered along the x axis by release_dates$album_name.
audio_features %>%
  filter(album_type == "album") %>%
  group_by(track_name, album_name) %>%
  # Collapse repeated listings of a track to a single median energy value.
  summarize(energy = median(energy)) %>%
  filter(
    str_detect(tolower(album_name), "live", negate = TRUE),
    str_detect(tolower(album_name), "acapellas", negate = TRUE),
    album_name %in% genius_lycics$album
  ) %>%
  mutate(
    album_name = factor(album_name, levels = release_dates$album_name)
  ) %>%
  ggplot(aes(x = album_name, y = energy)) +
  geom_boxplot(aes(color = album_name, fill = album_name), alpha = .3) +
  geom_point(aes(color = album_name), position = "jitter") +
  theme_delabj() +
  scale_color_delabj("retro") +
  scale_fill_delabj("retro") +
  labs(
    title = "The Energy of Linkin Park Albums",
    x = NULL,
    y = NULL,
    color = "Album Name",
    fill = "Album Name"
  ) +
  legend_none() +
  # Stagger the album labels over two rows so they do not overlap.
  scale_x_discrete(guide = guide_axis(n.dodge = 2))
We can also look at the RIAA certifications
# Timeline of RIAA album certifications: one point per certification, placed
# at its certification date on the row of its album. A vertical line marks
# 2017-07-20, which the text annotation labels "Chester's Death".
riaa_lp %>%
  na.omit() %>%
  dplyr::filter(format == "Album") %>%
  # Point size encodes a sales threshold: 500,000 for "Gold"; every other
  # certification level is drawn at 1,000,000.
  mutate(size = case_when(
    certification == "Gold" ~ 500000,
    TRUE ~ 1000000
  )) %>%
  ggplot() +
  geom_point(
    aes(
      x = certification_date,
      y = fct_reorder(album_title, release_date),
      size = size,
      color = album_title
    ),
    alpha = .75
  ) +
  # alpha is a fixed setting, so it goes outside aes(). Inside aes() it is
  # treated as a mapped variable and run through the alpha scale instead of
  # being applied as a literal 0.5.
  geom_vline(
    aes(xintercept = lubridate::ymd("2017-07-20")),
    alpha = .5
  ) +
  theme_delabj() +
  scale_color_delabj() +
  scale_size_continuous(range = c(4, 8)) +
  labs(
    title = "Linkin Park Timeline of Awards",
    subtitle = "After Chester Died there was a resurgence of purchases",
    y = NULL,
    x = "Date"
  ) +
  legend_none() +
  # Free-floating label; its x/y position is hard-coded in plot coordinates
  # (y is a position on the discrete album axis).
  geom_text(
    data = data.frame(
      x = as.Date("2015-04-06"),
      y = 8.32247240968638,
      label = "Chester's Death"
    ),
    mapping = aes(x = x, y = y, label = label),
    angle = 0L,
    lineheight = 1L,
    hjust = 0.5,
    vjust = 0.5,
    colour = "black",
    family = "sans",
    fontface = "plain",
    inherit.aes = FALSE,
    show.legend = FALSE
  ) +
  # Curved arrow running from just below the label towards the vertical line.
  geom_curve(
    data = data.frame(
      x = as.Date("2014-10-21"),
      y = 7.83474315204446,
      xend = as.Date("2017-06-08"),
      yend = 4.55069948392219
    ),
    mapping = aes(x = x, y = y, xend = xend, yend = yend),
    angle = 90L,
    colour = "black",
    curvature = 0.5,
    # Build the arrow with grid's public constructors rather than a
    # hand-assembled "arrow"/"unit" structure, which depends on grid's
    # internal object layout. ends = "last" and type = "closed" are the
    # named equivalents of the integer codes 2L and 2L.
    arrow = grid::arrow(
      angle = 30,
      length = grid::unit(0.1, "inches"),
      ends = "last",
      type = "closed"
    ),
    inherit.aes = FALSE,
    show.legend = FALSE
  ) +
  theme(axis.text.y = element_text(hjust = 0))