2) ProPer preparation (Praat-to-R).Rmd

# ProPer preparation (II): Praat-to-R

Collect data from Praat objects into an R dataframe.

```{r clean_start, warning=FALSE}
## Start from an empty workspace. The optional TextGrid chunks further down
## decide what to do with exists(), so objects left over from an earlier
## interactive run must not survive into this one.
rm(list = ls())

## Load required libraries
## (library() stops right away when a package is missing; require() would only
## warn, and warnings are switched off for this chunk.)
library(rPraat)
library(tidyverse)

##### Paths
## `pattern` is a regular expression, not a shell glob: the dot is escaped and
## the extension is anchored at the end of the file name, so that e.g.
## "\\.Pitch$" cannot pick up a ".PitchTier" file.
files_intensity <- list.files(path = "praat_data/intensity_tiers/", pattern = "\\.IntensityTier$", full.names = TRUE)
##
files_pitchTier <- list.files(path = "praat_data/pitch_tiers/", pattern = "\\.PitchTier$", full.names = TRUE)
##
files_pitchObject <- list.files(path = "praat_data/pitch_objects/", pattern = "\\.Pitch$", full.names = TRUE)
##
files_textGrid <- list.files(path = "praat_data/textgrids/", pattern = "\\.TextGrid$", full.names = TRUE)

```

# Harvest acoustic data

```{r pre_prepare_raw_df, warning=FALSE}

##### Full-time
# (build the full time table of each audio file, spanning tmin..tmax of its intensity tier)
fullTime_df <- plyr::ldply(files_intensity, function(f) {
  # File name without directory and extension
  base_name <- str_match(f, ".*/([^/.]*)\\.[^/]*$")[, 2]
  # The two underscore-separated parts of the file name; both are NA when the
  # name does not have exactly the form <part1>_<part2>.<extension>
  name_parts <- str_match(f, ".*/([^/._]*)_([^/._]*)\\.[^/]*$")
  tier <- it.read(f)
  # Grid from tmin to tmax in steps of 0.001, multiplied by 1000
  # (presumably seconds converted to milliseconds)
  grid <- 1000 * seq(tier[["tmin"]], tier[["tmax"]], 0.001)
  data.frame(
    file = base_name,
    # Going through character strips floating-point noise before the
    # integer conversion
    t = as.integer(as.character(grid)),
    speaker = name_parts[, 2],
    variable = name_parts[, 3]
  )
})

##### Intensity
# (collect the intensity values of every intensity tier)
intensity_df <- plyr::ldply(files_intensity, function(f) {
  # File name without directory and extension
  base_name <- str_match(f, ".*/([^/.]*)\\.[^/]*$")[, 2]
  tier <- it.read(f)
  # Time points rounded to 3 decimals and multiplied by 1000
  # (presumably seconds converted to milliseconds)
  stamps <- round(tier$t, 3) * 1000
  data.frame(
    file = base_name,
    # Going through character strips floating-point noise before the
    # integer conversion
    t = as.integer(as.character(stamps)),
    intensity = round(tier$i, 4)
  )
})

# take a look:
# IntensEx <- it.read(files_intensity[1])

##### F0: Pitch Tier
# (collect the smooth F0 curve stored in every pitch tier)
f0_smooth_df <- plyr::ldply(files_pitchTier, function(f) {
  # File name without directory and extension
  base_name <- str_match(f, ".*/([^/.]*)\\.[^/]*$")[, 2]
  tier <- pt.read(f)
  # Time points rounded to 3 decimals and multiplied by 1000
  # (presumably seconds converted to milliseconds)
  stamps <- round(tier[["t"]], 3) * 1000
  data.frame(
    file = base_name,
    # Going through character strips floating-point noise before the
    # integer conversion
    t = as.integer(as.character(stamps)),
    f0_smooth = round(tier[["f"]], 2)
  )
})

# take a look:
# PitchTierExample <- pt.read(files_pitchTier[1])

##### Pitch object
# get the Strength, i.e. the *similarity index* or *periodic fraction* from Praat's autocorrelation. Also, get the raw F0
# Note: this is the slowest step of the chunk. The Pitch object is converted to
# arrays only once per file and the ceiling mask is applied with a single
# vectorised product to keep the run time down.
pitchObject_df <- plyr::ldply(files_pitchObject, function(f) {
  # File name without directory and extension
  filename <- str_match(f, ".*/([^/.]*)\\.[^/]*$")
  file <- filename[, 2]
  pitch_object <- pitch.read(f)
  # Frame times rounded to 3 decimals and multiplied by 1000
  # (presumably seconds converted to milliseconds)
  time <- round(pitch_object$t, 3) * 1000
  pitch_ceiling <- 1000 # fixed to periods up to 1000Hz
  # Convert once and reuse: the arrays are read column by column below, one
  # column per entry of `time`
  pitch_arrays <- pitch.toArray(pitch_object)
  ###### row1 = Praat's path finder choice for F0 candidates
  f0_row1 <- apply(pitch_arrays[["frequencyArray"]], 2, function(x) x[1])
  f0_row1[which(f0_row1 == 0)] <- NA
  # Zeros are recoded as NA in both arrays
  strengthArray <- apply(as.data.frame(pitch_arrays[["strengthArray"]]), 2, function(x) ifelse(x == 0, NA, x))
  freqArray <- apply(as.data.frame(pitch_arrays[["frequencyArray"]]), 2, function(x) ifelse(x == 0, NA, x))
  # Mask: 0 for candidates above the ceiling, 1 otherwise (NA stays NA)
  zero_one_freqs <- apply(freqArray, 2, function(x) ifelse(x > pitch_ceiling, 0, 1))
  strength_limited <- strengthArray
  # NOTE(review): `[, -1]` leaves the first COLUMN out of the mask, so the
  # ceiling is not applied to the first column of the arrays. Kept as in the
  # original -- confirm whether this is intended.
  strength_limited[, -1] <- strengthArray[, -1] * zero_one_freqs[, -1]
  ###### rowmax = highest strength value within the frequency range (up to 'pitch_ceiling')
  strength_rowmax <- apply(strength_limited, 2, max, na.rm = TRUE)
  # max() over an all-NA column yields -Inf; report such columns as strength 0
  strength_rowmax[is.infinite(strength_rowmax)] <- 0
  # `time` is integer-valued up to floating-point noise, so round() before the
  # integer conversion yields the exact value
  data.frame(file, t = as.integer(round(time)), f0_raw = round(f0_row1, 2), periodicStrength = round(strength_rowmax, 7))
})

# take a look:
# PitchObEx <- pitch.read(files_pitchObject[1])

```

# Read TextGrids: 'Syllable' tier (optional, useful for analyses and exposition)

```{r pre_prepare_TextGrids_Syllable, warning=FALSE}

##### TextGrid
# TextGrids are optional! They are useful for exposition and to improve the automatic detection. By default, TextGrids are expected with at least one interval tier demarcating syllabic boundaries
#
# The following chunk takes syllabic intervals and labels from the "Syllable" tier.
# A TextGrid without such a tier is skipped with a message instead of stopping
# the whole chunk with an error.
if (length(files_textGrid) > 0) {
  textGridSyll_df <- plyr::ldply(files_textGrid, function(f) {
    # File name without directory and extension
    filename <- str_match(f, ".*/([^/.]*)\\.[^/]*$")
    file <- filename[, 2]
    tg <- tg.read(f, encoding = "auto")
    if (is.null(tg$Syllable)) {
      message("No 'Syllable' tier in ", f, " -- file skipped")
      return(NULL)
    }
    syll_tier <- data.frame(tg$Syllable)
    # Interval edges rounded to 3 decimals and multiplied by 1000 (presumably
    # seconds converted to milliseconds); intervals with an empty label get NA
    t1 <- ifelse(syll_tier$label == "", NA, round(syll_tier$t1, 3) * 1000)
    t2 <- ifelse(syll_tier$label == "", NA, round(syll_tier$t2, 3) * 1000)
    t_mid <- round((t1 + t2) / 2)
    # The times are integer-valued up to floating-point noise, so round()
    # before the integer conversion yields the exact value
    data.frame(
      file,
      t = as.integer(round(t1)),
      syll_start = as.integer(round(t1)),
      syll_mid = as.integer(round(t_mid)),
      syll_end = as.integer(round(t2)),
      syll_bounds = as.integer(round(t1)),
      syll_label = syll_tier$label
    )
  })
}
# No file contributed a "Syllable" tier: behave as if no TextGrid was provided
if (exists("textGridSyll_df") && nrow(textGridSyll_df) == 0) rm(textGridSyll_df)
#
if (exists("textGridSyll_df")) {
  textGridSyll_df <- textGridSyll_df %>%
    group_by(file) %>%
    mutate(
      # An unlabelled interval that follows a labelled one (within the same
      # file) marks the end of that labelled interval as a boundary
      syll_bounds = ifelse(
        (is.na(syll_bounds) & !is.na(lag(syll_end, 1))),
        lag(syll_end, 1),
        syll_bounds
      ),
      t = syll_bounds
    ) %>%
    # Drop the grouping so it does not leak into later steps
    ungroup() %>%
    # Rows that still have no time point cannot be matched to the time table
    dplyr::filter(!is.na(t))
}

# take a look:
# TexGriEx <- tg.read(files_textGrid[1], encoding = "auto")

```

# Read TextGrids: 'Word' tier (optional, useful for exposition purposes only)

```{r pre_prepare_TextGrids_Word, warning=FALSE}

# The following chunk takes word intervals and labels from the "Word" tier (for exposition purposes).
# A TextGrid without such a tier is skipped with a message instead of stopping
# the whole chunk with an error.
if (length(files_textGrid) > 0) {
  textGridWord_df <- plyr::ldply(files_textGrid, function(f) {
    # File name without directory and extension
    filename <- str_match(f, ".*/([^/.]*)\\.[^/]*$")
    file <- filename[, 2]
    tg <- tg.read(f, encoding = "auto")
    if (is.null(tg$Word)) {
      message("No 'Word' tier in ", f, " -- file skipped")
      return(NULL)
    }
    word_tier <- data.frame(tg$Word)
    # Interval edges rounded to 3 decimals and multiplied by 1000 (presumably
    # seconds converted to milliseconds); intervals with an empty label get NA
    word_t1 <- ifelse(word_tier$label == "", NA, round(word_tier$t1, 3) * 1000)
    word_t2 <- ifelse(word_tier$label == "", NA, round(word_tier$t2, 3) * 1000)
    word_t_mid <- round((word_t1 + word_t2) / 2)
    # The times are integer-valued up to floating-point noise, so round()
    # before the integer conversion yields the exact value
    data.frame(
      file,
      t = as.integer(round(word_t1)),
      word_start = as.integer(round(word_t1)),
      word_mid = as.integer(round(word_t_mid)),
      word_end = as.integer(round(word_t2)),
      word_bounds = as.integer(round(word_t1)),
      word_label = word_tier$label
    )
  })
}
# No file contributed a "Word" tier: behave as if no TextGrid was provided
if (exists("textGridWord_df") && nrow(textGridWord_df) == 0) rm(textGridWord_df)
#
if (exists("textGridWord_df")) {
  textGridWord_df <- textGridWord_df %>%
    group_by(file) %>%
    mutate(
      # An unlabelled interval that follows a labelled one (within the same
      # file) marks the end of that labelled interval as a boundary
      word_bounds = ifelse(
        (is.na(word_bounds) & !is.na(lag(word_end, 1))),
        lag(word_end, 1),
        word_bounds
      ),
      t = word_bounds
    ) %>%
    # Drop the grouping so it does not leak into later steps
    ungroup() %>%
    # Rows that still have no time point cannot be matched to the time table
    dplyr::filter(!is.na(t))
}
  
```

# Combine data into raw_df

```{r prepare_raw_df, warning=FALSE}

##### Combine all data
# The full time table is the backbone; every measurement table is attached to
# it by file name and time point, in the order smooth F0, intensity, pitch object.
raw_df <- fullTime_df %>%
  left_join(f0_smooth_df, by = c("file", "t")) %>%
  left_join(intensity_df, by = c("file", "t")) %>%
  left_join(pitchObject_df, by = c("file", "t"))
# The TextGrid tables are optional and only attached when they were created
if (exists("textGridSyll_df")) {
  raw_df <- left_join(raw_df, textGridSyll_df, by = c("file", "t"))
}
if (exists("textGridWord_df")) {
  raw_df <- left_join(raw_df, textGridWord_df, by = c("file", "t"))
}

```

## Optional chunk to trim your tables in case there are large portions of audio before and after the targets in each file (assuming annotated TextGrids were provided)

```{r trimData}

# require(zoo)

# raw_df <- mutate(
#   group_by(raw_df,file),
#   syll_label = na.locf(syll_label, na.rm=F),
#   syll_label = ifelse(row_number() < min(which(syll_label!=""))-150, "discard", syll_label),
#   syll_label = ifelse(row_number()-1 > max(which(syll_label!=""))+150, "discard", syll_label),
#   syll_label = as.factor(syll_label)
#   )
# 
# raw_df <- droplevels(raw_df[-which(raw_df$syll_label=="discard") , ])

```

# Write the raw_df table

```{r write_raw_df, warning=FALSE}

##### Write the raw data
# write.csv() does not create missing folders, so make sure the output folder
# exists first (nothing happens when it is already there).
dir.create("data_tables", showWarnings = FALSE, recursive = TRUE)
write.csv(raw_df, file = "data_tables/raw_df.csv", row.names = FALSE)

```