
Commit

Merged release/2.1.0 into master
Nils Reiter committed May 25, 2018
2 parents de95ebb + e4d2387 commit ad4d7b3
Showing 24 changed files with 903 additions and 142 deletions.
8 changes: 1 addition & 7 deletions .travis.yml
@@ -1,10 +1,4 @@
language: r
cache: packages
sudo: false
cache: packages
warnings_are_errors: false
addons:
apt:
packages:
ed
notifications:
slack: quadrama:0jGO6p0xkb5l0vSK00tQSJMr
7 changes: 4 additions & 3 deletions DESCRIPTION
@@ -1,8 +1,8 @@
Package: DramaAnalysis
Type: Package
Title: Scripts to support analysis of dramatic texts
Version: 2.0.2
Date: 2018-03-20
Version: 2.1.0
Date: 2018-05-25
Author: Nils Reiter <nils.reiter@ims.uni-stuttgart.de>, Tim Strohmayer <st151528@stud.uni-stuttgart.de>
Maintainer: Nils Reiter <nils.reiter@ims.uni-stuttgart.de>
Description: This package can be used to import and analyse pre-processed dramatic texts.
@@ -25,7 +25,8 @@ Depends:
R (>= 3.0.0)
RoxygenNote: 6.0.1
Suggests: testthat,
knitr
knitr,
highcharter
Encoding: UTF-8
VignetteBuilder: knitr
Remotes: bmschmidt/wordVectors@2.0
3 changes: 3 additions & 0 deletions NAMESPACE
@@ -11,6 +11,7 @@ export(enrichDictionary)
export(ensureSuffix)
export(figureStatistics)
export(figurematrix)
export(filterMentioned)
export(frequencytable)
export(hamming)
export(installCollectionData)
@@ -35,6 +36,8 @@ export(rankFiguresByDramatisPersonae)
export(regroup)
export(report)
export(scenicDifference)
export(setCollectionDirectory)
export(setDataDirectory)
export(setup)
export(tfidf)
export(utteranceStatistics)
14 changes: 12 additions & 2 deletions R/dictionaryStatistics.R
@@ -89,9 +89,11 @@ enrichDictionary <- function(dictionary, model, top=100, minimalSimilarity=0.4)
#' @seealso \code{\link{loadFields}}
#' @rdname dictionaryStatistics
#' @examples
#' \dontrun{
#' # Check multiple dictionary entries
#' data(rksp.0)
#' dstat <- dictionaryStatistics(rksp.0$mtext, fieldnames=c("Krieg","Familie"), names=TRUE)
#' }
#' @export
dictionaryStatistics <- function(t, fields=loadFields(fieldnames,baseurl),
fieldnames=c("Liebe"),
@@ -132,6 +134,14 @@ dictionaryStatistics <- function(t, fields=loadFields(fieldnames,baseurl),
names=names, column=column)
colnames(dss)[ncol(dss)] <- x
if (x == names(fields)[[1]]) {
if (segment=="Scene") {
u <- unique(t[,c("begin.Scene","Number.Act", "Number.Scene")])
dss <- merge(dss, u,
by.x="begin.Scene",
by.y="begin.Scene")
dss$begin.Scene <- NULL
data.table::setcolorder(dss, c("corpus","drama","Number.Act","Number.Scene","figure",x))
}
dss
} else {
dss[,x,with=FALSE]
@@ -163,7 +173,7 @@ dictionaryStatistics <- function(t, fields=loadFields(fieldnames,baseurl),
rownames(l$mat) <- switch(segment,
Drama=as.character(l$figure),
Act=paste(l$figure,utils::as.roman(l$Number.Act)),
Scene=paste(l$figure,utils::as.roman(l$Number.Act),l$Number.Scene))
Scene=paste(l$figure,l$begin.Scene))
l
} else {
r
@@ -209,7 +219,7 @@ dictionaryStatisticsSingle <- function(t, wordfield=c(),
switch(segment,
Drama=c("drama"),
Act=c("drama","Number.Act"),
Scene=c("drama","Number.Act","Number.Scene"))
Scene=c("drama","begin.Scene"))
)
if (byFigure == TRUE) {
bycolumns <- c(bycolumns, ifelse(names==TRUE,
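With this change, scene-level dictionary statistics are grouped internally on begin.Scene, and the act and scene numbers are merged back in before the result is returned. A minimal usage sketch, not part of the diff, assuming the bundled rksp.0 example data, reachable default dictionary fields, and that dictionaryStatistics() exposes the same segment argument as dictionaryStatisticsSingle():

data(rksp.0)
# Per-scene dictionary counts; the result now carries Number.Act and Number.Scene
dstat <- dictionaryStatistics(rksp.0$mtext,
                              fieldnames = c("Liebe", "Familie"),
                              segment = "Scene",
                              names = TRUE)
head(dstat)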
51 changes: 40 additions & 11 deletions R/figures.R
@@ -11,14 +11,15 @@
#' if necessary.
#' @param names If set to true, the table will contain figure names instead of ids
#' @param normalize Normalising the individual columns
#' @param segment "Drama", "Act", or "Scene". Allows calculating statistics on segments of the play
#' @importFrom stats sd
#' @importFrom stats aggregate
#' @importFrom data.table as.data.table
#' @examples
#' data(rksp.0)
#' stat <- figureStatistics(rksp.0$mtext, names = FALSE)
#' @export
figureStatistics <- function(t, names = FALSE, normalize = FALSE) {
figureStatistics <- function(t, names = FALSE, normalize = FALSE, segment=c("Drama","Act","Scene")) {

# prevent notes in R CMD check
. <- NULL
@@ -35,18 +36,46 @@ figureStatistics <- function(t, names = FALSE, normalize = FALSE) {
if (names == TRUE) {
b <- quote(Speaker.figure_surface)
}
segment <- match.arg(segment)

setkey(t, "corpus", "drama")
r <- t[,list(tokens=length(Token.surface),
types=data.table::uniqueN(Token.surface),
utterances=data.table::uniqueN(begin),
utteranceLengthMean=mean(rle(begin)$lengths),
utteranceLengthSd=sd(rle(begin)$lengths),
firstBegin=min(begin),
lastEnd=max(end)
),by=.(corpus,drama,length,eval(b))]

if (segment == "Scene") {
r <- t[,list(tokens=length(Token.surface),
types=data.table::uniqueN(Token.surface),
utterances=data.table::uniqueN(begin),
utteranceLengthMean=mean(rle(begin)$lengths),
utteranceLengthSd=sd(rle(begin)$lengths),
firstBegin=min(begin),
lastEnd=max(end)
),by=.(corpus,drama,begin.Act,begin.Scene,length,eval(b))][,begin.Scene:=as.integer(as.factor(begin.Scene)),begin.Act]
r$begin.Act <- as.roman(as.integer(as.factor(r$begin.Act)))
colnames(r)[3:4] <- c("Act","Scene")
fcol <- 6
} else if (segment == "Act") {
r <- t[,list(tokens=length(Token.surface),
types=data.table::uniqueN(Token.surface),
utterances=data.table::uniqueN(begin),
utteranceLengthMean=mean(rle(begin)$lengths),
utteranceLengthSd=sd(rle(begin)$lengths),
firstBegin=min(begin),
lastEnd=max(end)
),by=.(corpus,drama,begin.Act,length,eval(b))]
r$begin.Act <- as.roman(as.integer(as.factor(r$begin.Act)))
colnames(r)[3] <- "Act"
fcol <- 5
} else {
r <- t[,list(tokens=length(Token.surface),
types=data.table::uniqueN(Token.surface),
utterances=data.table::uniqueN(begin),
utteranceLengthMean=mean(rle(begin)$lengths),
utteranceLengthSd=sd(rle(begin)$lengths),
firstBegin=min(begin),
lastEnd=max(end)
),by=.(corpus,drama,length,eval(b))]
fcol <- 4
}

colnames(r)[4] <- "figure"
colnames(r)[fcol] <- "figure"
if (normalize == TRUE) {
r$tokens <- r$tokens / r$length
r$utterances <- ave(r$utterances, r$drama, FUN=function(x) {x/sum(x)})
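figureStatistics() now accepts a segment argument ("Drama", "Act", or "Scene"); per-act and per-scene results label the act as a Roman numeral and renumber scenes within each act. A rough usage sketch, assuming the bundled rksp.0 example data:

data(rksp.0)
# Whole-play figure statistics (unchanged default behaviour)
stat <- figureStatistics(rksp.0$mtext, names = TRUE)
# Per-scene statistics: adds Act (Roman numeral) and Scene columns
statScene <- figureStatistics(rksp.0$mtext, names = TRUE, segment = "Scene")
head(statScene)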
16 changes: 14 additions & 2 deletions R/load.R
@@ -1,16 +1,28 @@

#' This function initialises the import from XMI files.
#' This function initialises the paths to data files.
#' @param dataDirectory A path to the directory in which data and metadata are located.
#' "~/QuaDramA/Data" by default.
#' "~/QuaDramA/Data2" by default.
#' @param collectionDirectory A path to the directory in which collections are stored.
#' By default, the directory is called "collection" below the data directory.
#' @export
setup <- function(dataDirectory = file.path(path.expand("~"),"QuaDramA","Data2"),
collectionDirectory = file.path(dataDirectory,"collections")) {
message("Since 2.1 it is no longer necessary to call setup() if you're happy with the default paths.")
options(qd.datadir=dataDirectory)
options(qd.collectionDirectory=collectionDirectory)
}

#' @export
#' @rdname setup
setDataDirectory <- function(dataDirectory = file.path(path.expand("~"),"QuaDramA","Data2")) {
options(qd.datadir=dataDirectory)
}

#' @export
#' @rdname setup
setCollectionDirectory <- function(collectionDirectory = file.path(getOption("qd.datadir"), "collections")) {
options(qd.collectionDirectory=collectionDirectory)
}

#' @importFrom utils read.table
loadSetsInternally <- function() {
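Because the default paths are now set when the package is loaded (see R/zzz.R below), setup() only needs to be called to override them, and the two new setters adjust one path at a time. An illustrative sketch; the data directory shown is a hypothetical example:

# Only needed when deviating from the defaults established at load time
setDataDirectory(file.path(path.expand("~"), "my-quadrama-data"))  # hypothetical path
setCollectionDirectory()   # defaults to <data directory>/collections
getOption("qd.datadir")
getOption("qd.collectionDirectory")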
Empty file removed R/onLoad.R
Empty file.
2 changes: 2 additions & 0 deletions R/plots.R
@@ -44,11 +44,13 @@ plotUtterancePositions <- function(utteranceStatistics,segmentedText=NULL,colors
#' @importFrom fmsb radarchart
#' @export
#' @examples
#' \dontrun{
#' data(rksp.0)
#' fnames <- c("Krieg", "Liebe", "Familie", "Ratio","Religion")
#' ds <- dictionaryStatistics(rksp.0$mtext, normalizeByField=TRUE, names=TRUE,
#' fieldnames=fnames, asList=TRUE)
#' plotSpiderWebs(dstat=ds,max=50)
#' }
plotSpiderWebs <- function(dstat=NULL, mat=dstat$mat, names=dstat$figure,
symbols=c(17,16,15,4,8),
maxValue=max(mat),minValue=min(mat), cglcol="black",
104 changes: 85 additions & 19 deletions R/text.R
@@ -23,28 +23,38 @@ qd.colors <- c(rgb(120,28,129, maxColorValue = 255),
#' @param text The dramatic text in table form
#' @param by A character vector, either "rank" or "tokens" (or an unambiguous substring)
#' @param threshold A number specifying the limit
#' @param other Whether to summarize filtered figures as 'OTHER' instead of removing their speech
#' @export
#' @examples
#' data(rksp.0)
#' text.top10 <- limitFigures(rksp.0$mtext)
limitFigures <- function(text, by=c("rank","tokens"), threshold=ifelse(by=="tokens",500,10)) {
limitFigures <- function(text, by=c("rank","tokens"), threshold=ifelse(by=="tokens",500,10), other=FALSE) {
by <- match.arg(by)
switch(by,
tokens=limitFiguresByTokens(text, minTokens=threshold),
rank=limitFiguresByRank(text, maxRank = threshold),
tokens=limitFiguresByTokens(text, minTokens=threshold, other=other),
rank=limitFiguresByRank(text, maxRank = threshold, other=other),
stop("Invalid filtering criterion"))
}

#' This method removes the spoken tokens of all but the most frequent n figures
#' @param t The text, a data frame listing each token for each figure
#' @param maxRank Up to maxRank figures remain in the data set
#' @param other Whether to summarize filtered figures as 'OTHER' instead of removing their speech
#' @keywords internal
#' @importFrom utils head
limitFiguresByRank <- function(t, maxRank=10) {
counts <- aggregate(t$Speaker.figure_surface, by=list(t$drama, t$Speaker.figure_id), length)
counts <- counts[order(counts$x, decreasing = TRUE),]
rcounts <- Reduce(rbind, by(counts, counts["Group.1"], head, n=maxRank))
r <- t[paste(t$drama, t$Speaker.figure_id) %in% paste(rcounts$Group.1, rcounts$Group.2),]
limitFiguresByRank <- function(t, maxRank=10, other=FALSE) {
if (other == FALSE) {
r <- t[,n:=.N,.(corpus,drama,Speaker.figure_surface)][,.SD[n%in%maxN(unique(n),maxRank)], by=.(corpus,drama)][,n:=NULL,][]
} else {
counts <- aggregate(t$Speaker.figure_surface, by=list(t$drama, t$Speaker.figure_id, t$Speaker.figure_surface), length)
counts <- counts[order(counts$x, decreasing = TRUE),]
rcounts <- Reduce(rbind, by(counts, counts["Group.1"], head, n=maxRank))
r <- t
levels(r$Speaker.figure_id) <- c(levels(r$Speaker.figure_id),"OTHER")
levels(r$Speaker.figure_surface) <- c(levels(r$Speaker.figure_surface),"OTHER")
r$Speaker.figure_id[!(r$Speaker.figure_id %in% rcounts$Group.2)] <- "OTHER"
r$Speaker.figure_surface[!(r$Speaker.figure_surface %in% rcounts$Group.3)] <- "OTHER"
}
r$Speaker.figure_id <- droplevels(r$Speaker.figure_id)
r$Speaker.figure_surface <- droplevels(r$Speaker.figure_surface)
r
@@ -53,14 +63,23 @@ limitFiguresByRank <- function(t, maxRank=10) {
#' This method removes the spoken tokens by all figures that speak infrequently.
#' @param t The text, a data frame listing each token for each figure
#' @param minTokens The minimal amount of tokens a figure has to speak
#' @param other Whether to summarize filtered figures as 'OTHER' instead of removing their speech
#' @keywords internal
limitFiguresByTokens <- function(t, minTokens=100) {
counts <- tapply(t$Speaker.figure_surface, paste(t$drama, t$Speaker.figure_id), length)
write(paste(length(counts[counts > minTokens]), "figures remaining."),stderr())
r <- subset(t, counts[paste(t$drama, t$Speaker.figure_id)] > minTokens )
r$Speaker.figure_id <- droplevels(r$Speaker.figure_id)
r$Speaker.figure_surface <- droplevels(r$Speaker.figure_surface)
r
limitFiguresByTokens <- function(t, minTokens=100, other=FALSE) {
if (other == FALSE) {
r <- t[,n:=.N,.(corpus,drama,Speaker.figure_surface)][,.SD[n>=minTokens],by=.(corpus,drama)][,n:=NULL][]
} else {
counts <- aggregate(t$Speaker.figure_surface, by=list(t$drama, t$Speaker.figure_id, t$Speaker.figure_surface), length)
rcounts <- counts[(counts$x > minTokens),]
r <- t
levels(r$Speaker.figure_id) <- c(levels(r$Speaker.figure_id),"OTHER")
levels(r$Speaker.figure_surface) <- c(levels(r$Speaker.figure_surface),"OTHER")
r$Speaker.figure_id[!(r$Speaker.figure_id %in% rcounts$Group.2)] <- "OTHER"
r$Speaker.figure_surface[!(r$Speaker.figure_surface %in% rcounts$Group.3)] <- "OTHER"
}
r$Speaker.figure_id <- droplevels(r$Speaker.figure_id)
r$Speaker.figure_surface <- droplevels(r$Speaker.figure_surface)
r
}


Expand All @@ -75,6 +94,33 @@ limit.figures.by.tokens <- function(...) {
limitFiguresByTokens(...)
}

#' @title Filtering Mentioned Figures
#' @description This function can be used to remove the mentions of figures
#' that do not appear as speakers in the subsetted input text (after using
#' limitFigures(), for example), or to summarize them as 'OTHER'.
#' @param t The text, a data frame listing each token for each figure
#' @param other Whether to summarize mentioned figures as 'OTHER'
#' @export
#' @examples
#' data(rksp.0)
#' text.top10.filtered <- filterMentioned(limitFigures(rksp.0$mtext))
filterMentioned <- function(t, other=FALSE) {
figure_id.set <- unique(t$Speaker.figure_id)
figure_surface.set <- unique(t$Speaker.figure_surface)
if (other == FALSE) {
t$Mentioned.figure_id[!(t$Mentioned.figure_id %in% figure_id.set)] <- NA
t$Mentioned.figure_surface[!t$Mentioned.figure_surface %in% figure_surface.set] <- NA
} else {
levels(t$Mentioned.figure_id) <- c(levels(t$Mentioned.figure_id),"OTHER")
levels(t$Mentioned.figure_surface) <- c(levels(t$Mentioned.figure_surface),"OTHER")
t$Mentioned.figure_id[!(t$Mentioned.figure_id %in% figure_id.set) & !(is.na(t$Mentioned.figure_id))] <- "OTHER"
t$Mentioned.figure_surface[!(t$Mentioned.figure_surface %in% figure_surface.set) & !(is.na(t$Mentioned.figure_surface))] <- "OTHER"
}
t$Mentioned.figure_id <- droplevels(t$Mentioned.figure_id)
t$Mentioned.figure_surface <- droplevels(t$Mentioned.figure_surface)
t
}

tfidf1 <- function(word) {
docfreq <- sum(word>0)
docfreq <- log((length(word)+1) / (sum(word>0)))
@@ -126,15 +172,26 @@ extractTopTerms <- function(mat, top=10) {

#' @title Report
#' @description generates a report for a specific dramatic text
#' @param id The id of the text
#' @param id The id of the text or a list of ids
#' @param of The output file
#' @param colors A list of colors to be used for plots
#' @param type The type of the report. "Single" gives a report about a single play,
#' while "Compare" can be used to compare multiple editions of a play
#' @param ... Arguments passed through to the rmarkdown document
#' @importFrom rmarkdown render
#' @importFrom igraph graph_from_adjacency_matrix plot.igraph layout_ on_grid
#' @export
report <- function(id="test:rksp.0", of=file.path(getwd(),paste0(unlist(strsplit(id,":",fixed=TRUE))[2], ".html")), colors=qd.colors) {
report <- function(id="test:rksp.0",
of=file.path(getwd(),paste0(unlist(strsplit(id,":",fixed=TRUE))[2], ".html")),
type=c("Single"),
...) {
force(of)
rmarkdown::render(system.file("rmd/Report.Rmd", package="DramaAnalysis"), params=list(id=id, col=colors),
type <- match.arg(type)

fileName <- switch(type,
Single="Report.Rmd",
Compare="Compare-editions.Rmd")
rmarkdown::render(system.file(paste0("rmd/",fileName), package="DramaAnalysis"),
params=list(id=id,col=qd.colors,...),
output_format = "html_document",
output_file = of)
}
Expand Down Expand Up @@ -231,3 +288,12 @@ last <- function(x, n=0) {
}
sort(x,partial=len-(n-1))[len-(n-1)]
}

maxN <- function(x, N=2){
len <- length(x)
if(N>len){
warning('N greater than length(x). Setting N=length(x)')
N <- length(x)
}
sort(x,partial=len-N+1)[(len-N+1):(len)]
}
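Taken together, the changes in R/text.R let filtered-out figures be collapsed into an 'OTHER' level instead of being dropped, restrict figure mentions to the remaining speakers, route report() through a report type, and add the maxN() helper used for rank filtering. A hedged usage sketch, assuming the bundled rksp.0 example data:

data(rksp.0)
# Keep the ten highest-ranked speakers; collapse everyone else into "OTHER"
text.top10 <- limitFigures(rksp.0$mtext, by = "rank", threshold = 10, other = TRUE)
# Remove (or, with other = TRUE, relabel) mentions of figures that no longer speak
text.top10.filtered <- filterMentioned(text.top10)
# maxN() returns the N largest values of a vector (order not guaranteed)
maxN(c(3, 9, 1, 7), N = 2)
# report(id = "test:rksp.0")   # renders rmd/Report.Rmd; type defaults to "Single"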
10 changes: 10 additions & 0 deletions R/zzz.R
@@ -0,0 +1,10 @@
.onLoad <- function(libname, pkgname) {
dataDirectory <- file.path(path.expand("~"),"QuaDramA","Data2")
collectionDirectory <- file.path(dataDirectory,"collections")
options(qd.datadir=dataDirectory)
options(qd.collectionDirectory=collectionDirectory)
}

.onAttach <- function(libname, pkgname) {
packageStartupMessage(paste("DramaAnalysis",utils::packageVersion("DramaAnalysis")))
}
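The new .onLoad() hook sets the default data and collection directories as options as soon as the package is loaded, which is why calling setup() became optional; .onAttach() prints the package version. An illustrative check of the resulting options:

library(DramaAnalysis)
getOption("qd.datadir")              # ~/QuaDramA/Data2 by default
getOption("qd.collectionDirectory")  # <data directory>/collections by default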