analyze-dyhead-predictions.Rmd

---
title: "Analyze DyHead predictions"
output:
  html_document:
    toc: yes
    toc_depth: 3
    toc_float: yes
---

```{r setup, include=FALSE}
# Default chunk option: echo each chunk's source code in the rendered output.
knitr::opts_chunk$set(echo = TRUE)
```

## Import data

```{r import_data}
library(data.table)

# Frame-level scores; the parquet file is generated by video-dyhead.ipynb.
df <- arrow::read_parquet("data/unlabeled-video-frames.parquet")
setDT(df)  # convert to a data.table by reference
dim(df)
head(df)

class(df)
# Rename the last column (whatever it is called) to "id", by position.
setnames(df, ncol(df), "id")

# Number of distinct videos (the original author noted 102).
uniqueN(df$video)
summary(df[["frame"]])

## Smooth predictions for each video

```{r smooth_preds}
# Work on a copy so the raw frame-level scores in `df` stay untouched
# (`:=` below modifies its table by reference).
df2 <- copy(df)

names(df2)

# Columns to smooth: everything except the identifier columns.
(cols <- setdiff(names(df2), c("id", "frame", "video")))

head(df)

# < 1 second
system.time({
  # Smooth each class column within each video using a centered 5-frame
  # rolling mean. Positions without a complete window are filled with 0.
  df2[, (cols) := lapply(.SD, frollmean, n = 5, fill = 0, align = "center",
                         na.rm = TRUE),
      by = video, .SDcols = cols]
})

arrow::write_parquet(df2,
                     sink = here::here("data/dyhead-scores-smoothed.parquet"))

# `df3` is the smoothed table used by the later classification/plot chunks.
df3 <- copy(df2)

system.time({
  # Per video and class: 1 if the maximum smoothed probability exceeds 0.8,
  # otherwise 0. The result has one row per video.
  df4 <- df3[, lapply(.SD, function(col) {
    as.integer(max(col) > 0.8)
  }), by = video, .SDcols = cols]
})

head(df4)

# Share of videos flagged for each class. The class columns are selected by
# name (`cols`) rather than by position, so this keeps working if the number
# or order of classes changes.
# Author's note from the original run: no synthetic nicotine labels found.
colMeans(df4[, ..cols])

arrow::write_parquet(df4,
                     sink = here::here("data/dyhead-videos-smoothed-classified.parquet"))

rio::export(df4, file = here::here("tables/dyhead-videos-smoothed-classified.xlsx"))
```

### Alternative

```{r alt_approach}
system.time({
  # Alternative rule: a video is flagged for a class (1, else 0) when more
  # than 15 of its frames have a smoothed probability above 0.7.
  df4 <- df3[, lapply(.SD, function(frame_probs) {
    n_confident_frames <- sum(frame_probs > 0.7)
    as.integer(n_confident_frames > 15)
  }), by = video, .SDcols = cols]
})

rio::export(df4, file = here::here("tables/dyhead-videos-smoothed-classified-alt.xlsx"))
```

### Plot video

```{r plot_video}
# dplyr supplies `%>%`, filter(), select() and all_of(); ggplot2 supplies the
# plotting functions. Both are used from the first plot in this chunk onward,
# so they are attached here rather than in a later chunk.
library(dplyr)
library(ggplot2)

head(df3)

# Videos used as worked examples below.
video_fabio <- "fabio_health_2_and_pod-scored"
video_chamillion <- "chamillioneyes_ecigs_2-scored"
video_calitrickzz <- "calitrickzz_mj_8-scored"

# Return a copy of `scores` in which every column in `score_cols` is replaced,
# within each video, by its centered rolling mean over `window` frames.
# Positions without a complete window are filled with 0. `scores` itself is
# not modified.
smooth_scores <- function(scores, window, score_cols = cols) {
  smoothed <- copy(scores)
  smoothed[, (score_cols) := lapply(.SD, frollmean, n = window, fill = 0,
                                    align = "center", na.rm = TRUE),
           by = video, .SDcols = score_cols]
  smoothed[]
}

# Line plot of per-frame class probabilities for a single video.
#   scores       - table with `video`, `frame` and one column per class
#   target_video - value of `video` to plot
#   classes      - names of the class columns to draw (one line each)
#   percent_axis - if TRUE, show the y axis as percentages on a fixed 0-1
#                  range and add descriptive axis titles
plot_class_probs <- function(scores, target_video, classes = c("mod", "pod"),
                             percent_axis = FALSE) {
  long_scores <- scores %>%
    filter(video == target_video) %>%
    select(all_of(classes), frame) %>%
    tidyr::pivot_longer(cols = all_of(classes), values_to = "prob",
                        names_to = "class")

  p <- ggplot(long_scores, aes(x = frame, y = prob, color = class)) +
    geom_line() +
    theme_minimal()

  if (percent_axis) {
    p <- p +
      scale_y_continuous(labels = scales::percent_format(accuracy = 1),
                         limits = c(0, 1)) +
      labs(x = "Video frame",
           y = "Predicted class probability")
  }
  p
}

# Smoothed data (5-frame rolling mean).
plot_class_probs(df3, video_fabio)

# Original, unsmoothed data.
plot_class_probs(df, video_fabio)

# Try a 10-frame rolling mean.
system.time({
  df2_10 <- smooth_scores(df, window = 10)
})
plot_class_probs(df2_10, video_fabio)

# Try a 20-frame rolling mean.
system.time({
  df2_20 <- smooth_scores(df, window = 20)
})
plot_class_probs(df2_20, video_fabio)

# Try a 40-frame rolling mean.
system.time({
  df2_40 <- smooth_scores(df, window = 40)
})
plot_class_probs(df2_40, video_fabio)

######################
######################

# Compare two videos.
#
# First with the 5-frame smoothed probabilities.
# NOTE(review): an earlier comment here said "raw probabilities", but `df2`
# holds the 5-frame smoothed scores; switch to `df` if raw was intended.
comparison_classes <- c("mod", "pod", "e-juice")

(p1 <- plot_class_probs(df2, video_fabio, comparison_classes,
                        percent_axis = TRUE))

(p2 <- plot_class_probs(df2, video_chamillion, comparison_classes,
                        percent_axis = TRUE))

# Stack the panels; only the bottom panel keeps its x-axis title and legend.
cowplot::plot_grid(p1 + theme(axis.title.x = element_blank()) +
                     guides(color = "none"),
                   p2 + theme(legend.position = "top"),
                   labels = c("A.", "B."), nrow = 2)

#####
# Now 10-frame smooths.
(p1 <- plot_class_probs(df2_10, video_fabio, comparison_classes,
                        percent_axis = TRUE))

(p2 <- plot_class_probs(df2_10, video_chamillion, comparison_classes,
                        percent_axis = TRUE))

(p3 <- plot_class_probs(df2_10, video_calitrickzz, comparison_classes,
                        percent_axis = TRUE))

cowplot::plot_grid(p1 + theme(axis.title.x = element_blank()) +
                     guides(color = "none"),
                   p2 + theme(axis.title.x = element_blank()) +
                     guides(color = "none"),
                   p3 + theme(legend.position = "top"),
                   labels = c("A.", "B.", "C."), nrow = 3, scale = 0.95)
```

### Revised analysis

```{r alt_2}

system.time({
  # Revised rule on the 10-frame smoothed scores: a video is flagged for a
  # class (1, else 0) when at least 3 of its frames have a smoothed
  # probability above 0.6.
  df4 <- df2_10[, lapply(.SD, function(frame_probs) {
    n_confident_frames <- sum(frame_probs > 0.6)
    as.integer(n_confident_frames >= 3)
  }), by = video, .SDcols = cols]
})

rio::export(df4, file = here::here("tables/dyhead-videos-smoothed-classified-alt-v2.xlsx"))
```

## Integrate metadata

```{r plots}
# Reload the per-video classifications from the parquet file written in the
# `smooth_preds` chunk; `df4` has been overwritten by the alternative
# classification chunks since then.
df4 <- rio::import(here::here("data/dyhead-videos-smoothed-classified.parquet"))
head(df4)

# NOTE(review): absolute, machine-specific path; the file must exist at this
# location for the chunk to run.
video_info <- rio::import("/vape/collection/appended_scrape_download/TikTok/Influencers/analytic_sample_17361_20192022.csv")
dim(video_info)
head(video_info)

# Strip a trailing ".mp4" extension so the shortcode can be matched against
# `df4$video`. The dot is escaped so that only a literal ".mp4" suffix is
# removed (an unescaped "." would match any character before "mp4").
video_info$video_shortcode <- sub("\\.mp4$", "", video_info$video_shortcode)

# Attach the date info. merge() keeps only videos present in both tables.
df5 <- merge(df4, video_info[, c("video_shortcode", "date_tot")],
             by.x = "video", by.y = "video_shortcode")
head(df5)
str(df5)
```

## Plots

### Subfig_a

```{r subfig_a}
library(dplyr)
library(ggplot2)
library(tidyr)

# Mean of each 0/1 class column across videos, i.e. the share of videos
# flagged for that class. The synthetic nicotine label class is left out.
df_means <- df5 %>%
  select(-c(video, date_tot, `synthetic nicotine label`)) %>%
  colMeans()

# One row per class, sorted from least to most prevalent.
plot_df <- data.frame(class = names(df_means), prevalence = unname(df_means))
(plot_df <- arrange(plot_df, prevalence))

# Fix the factor levels to the sorted order; otherwise ggplot would arrange
# the classes alphabetically. Labels are converted to sentence case.
class_levels <- plot_df$class
plot_df$class <- factor(class_levels,
                        levels = class_levels,
                        labels = stringr::str_to_sentence(class_levels))

# More descriptive display names for a few classes.
plot_df$class <- forcats::fct_recode(
  plot_df$class,
  "Packaging box" = "Box",
  "E-juice flavor name" = "E-juice flavor",
  "Nicotine warning label" = "Warning label nicotine"
)

# Whole-percent text drawn on each bar.
plot_df$label <- scales::percent(plot_df$prevalence, accuracy = 1)
plot_df

# Horizontal bar chart of class prevalence with the percentage printed in
# white inside the end of each bar.
plot_a <- ggplot(plot_df, aes(x = class, y = prevalence)) +
  geom_col() +
  geom_text(aes(label = label), hjust = 1.2, color = "white", size = 5,
            fontface = "bold") +
  coord_flip() +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(x = "Object class", y = "Prevalence") +
  theme_minimal(base_size = 13) +
  theme(axis.title.y = element_blank(),
        panel.grid.major.y = element_blank())
plot_a
```

### Subfig_b

```{r subfig_b}

# Yearly prevalence of each object class: within each year every class column
# is replaced by its mean and a single row per year is kept.
# mutate() + first-row filter (rather than summarise()) leaves the columns in
# their original order, which the `:` column ranges below depend on.
plot_df <- df5 %>%
  select(-c(video, `synthetic nicotine label`, `smoke cloud`)) %>%
  mutate(year = data.table::year(date_tot)) %>%
  group_by(year) %>%
  mutate(n = n(),
         across(box:`warning label nicotine`, mean)) %>%
  filter(row_number() == 1) %>%
  # Drop the grouping so later verbs operate on a plain tibble.
  ungroup() %>%
  select(-date_tot)

head(plot_df)

# Long format: one row per year and class, with a whole-percent text label.
plot_df2 <- plot_df %>%
  select(-n, -box) %>%
  tidyr::pivot_longer(cols = `e-cigarette brand name`:`warning label nicotine`,
                      values_to = "prevalence") %>%
  mutate(label = scales::percent(prevalence, accuracy = 1))

head(plot_df2)

# Class names used to label the right-hand end of each line (2022 points).
plot_end_labels <- plot_df2 %>%
  filter(year == 2022) %>%
  mutate(label = stringr::str_to_sentence(name))

plot_end_labels

# Examples at https://ggrepel.slowkow.com/articles/examples.html
library(ggrepel)
(plot_b = ggplot(plot_df2, aes(x = year, y = prevalence, color = name)) +
    geom_point() +
    geom_line() +
    theme_minimal(base_size = 14) +
    scale_x_continuous(breaks = seq(2019, 2022)) +
    scale_y_continuous(labels = scales::percent_format()) +
    # Percentage label at every point; labels only move vertically.
    ggrepel::geom_label_repel(aes(label = label), fontface = "bold",
                              min.segment.length = 100,
                              seed = 1,
                              direction = "y") +
    theme(panel.grid = element_blank(),
          # Give a good right margin for the line labels
          plot.margin = margin(0.5, 5, 0.5, 0.5, "cm"),
          legend.position = "none") +
    # Class name next to the end of each line, without connecting segments.
    ggrepel::geom_text_repel(data = plot_end_labels, aes(label = label),
                             fontface = "bold",
                             min.segment.length = Inf,
                             seed = 1,
                             # Keep labels in right margin
                             xlim = c(max(plot_df2$year, na.rm = TRUE) + 0.1, Inf)) +
    labs(x = "Year", y = "Prevalence") +
    # Need this for the line labels
    coord_cartesian(clip = "off"))

# Combine both panels and save that exact object, instead of relying on
# ggsave()'s implicit "last plot displayed" default.
figure3 <- cowplot::plot_grid(plot_a, plot_b, ncol = 2, labels = c("A.", "B."))
figure3
ggsave("visuals/figure3.png", plot = figure3,
       width = 12, height = 6)
ggsave("visuals/figure3.pdf", plot = figure3,
       width = 12, height = 6)
```