
Commit

Merged release/2.1.0 into master
Nils Reiter committed May 25, 2018
2 parents de95ebb + e4d2387 commit ad4d7b3
Showing 24 changed files with 903 additions and 142 deletions.
8 changes: 1 addition & 7 deletions .travis.yml
@@ -1,10 +1,4 @@
language: r
cache: packages
sudo: false
cache: packages
warnings_are_errors: false
addons:
apt:
packages:
ed
notifications:
slack: quadrama:0jGO6p0xkb5l0vSK00tQSJMr
7 changes: 4 additions & 3 deletions DESCRIPTION
@@ -1,8 +1,8 @@
Package: DramaAnalysis
Type: Package
Title: Scripts to support analysis of dramatic texts
Version: 2.0.2
Date: 2018-03-20
Version: 2.1.0
Date: 2018-05-25
Author: Nils Reiter <nils.reiter@ims.uni-stuttgart.de>, Tim Strohmayer <st151528@stud.uni-stuttgart.de>
Maintainer: Nils Reiter <nils.reiter@ims.uni-stuttgart.de>
Description: This package can be used to import and analyse pre-processed dramatic texts.
@@ -25,7 +25,8 @@ Depends:
R (>= 3.0.0)
RoxygenNote: 6.0.1
Suggests: testthat,
knitr
knitr,
highcharter
Encoding: UTF-8
VignetteBuilder: knitr
Remotes: bmschmidt/wordVectors@2.0
3 changes: 3 additions & 0 deletions NAMESPACE
@@ -11,6 +11,7 @@ export(enrichDictionary)
export(ensureSuffix)
export(figureStatistics)
export(figurematrix)
export(filterMentioned)
export(frequencytable)
export(hamming)
export(installCollectionData)
@@ -35,6 +36,8 @@ export(rankFiguresByDramatisPersonae)
export(regroup)
export(report)
export(scenicDifference)
export(setCollectionDirectory)
export(setDataDirectory)
export(setup)
export(tfidf)
export(utteranceStatistics)
14 changes: 12 additions & 2 deletions R/dictionaryStatistics.R
@@ -89,9 +89,11 @@ enrichDictionary <- function(dictionary, model, top=100, minimalSimilarity=0.4)
#' @seealso \code{\link{loadFields}}
#' @rdname dictionaryStatistics
#' @examples
#' \dontrun{
#' # Check multiple dictionary entries
#' data(rksp.0)
#' dstat <- dictionaryStatistics(rksp.0$mtext, fieldnames=c("Krieg","Familie"), names=TRUE)
#' }
#' @export
dictionaryStatistics <- function(t, fields=loadFields(fieldnames,baseurl),
fieldnames=c("Liebe"),
@@ -132,6 +134,14 @@ dictionaryStatistics <- function(t, fields=loadFields(fieldnames,baseurl),
names=names, column=column)
colnames(dss)[ncol(dss)] <- x
if (x == names(fields)[[1]]) {
if (segment=="Scene") {
u <- unique(t[,c("begin.Scene","Number.Act", "Number.Scene")])
dss <- merge(dss, u,
by.x="begin.Scene",
by.y="begin.Scene")
dss$begin.Scene <- NULL
data.table::setcolorder(dss, c("corpus","drama","Number.Act","Number.Scene","figure",x))
}
dss
} else {
dss[,x,with=FALSE]
@@ -163,7 +173,7 @@ dictionaryStatistics <- function(t, fields=loadFields(fieldnames,baseurl),
rownames(l$mat) <- switch(segment,
Drama=as.character(l$figure),
Act=paste(l$figure,utils::as.roman(l$Number.Act)),
Scene=paste(l$figure,utils::as.roman(l$Number.Act),l$Number.Scene))
Scene=paste(l$figure,l$begin.Scene))
l
} else {
r
@@ -209,7 +219,7 @@ dictionaryStatisticsSingle <- function(t, wordfield=c(),
switch(segment,
Drama=c("drama"),
Act=c("drama","Number.Act"),
Scene=c("drama","Number.Act","Number.Scene"))
Scene=c("drama","begin.Scene"))
)
if (byFigure == TRUE) {
bycolumns <- c(bycolumns, ifelse(names==TRUE,
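With this change, scene-level dictionary statistics are grouped internally on begin.Scene, and the act and scene numbers are merged back in before the result is returned. A minimal usage sketch, not part of the diff, assuming the bundled rksp.0 example data, reachable default dictionary fields, and that dictionaryStatistics() exposes the same segment argument as dictionaryStatisticsSingle():

data(rksp.0)
# Per-scene dictionary counts; the result now carries Number.Act and Number.Scene
dstat <- dictionaryStatistics(rksp.0$mtext,
                              fieldnames = c("Liebe", "Familie"),
                              segment = "Scene",
                              names = TRUE)
head(dstat)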
51 changes: 40 additions & 11 deletions R/figures.R
@@ -11,14 +11,15 @@
#' if necessary.
#' @param names If set to true, the table will contain figure names instead of ids
#' @param normalize Normalising the individual columns
#' @param segment "Drama", "Act", or "Scene". Allows calculating statistics on segments of the play
#' @importFrom stats sd
#' @importFrom stats aggregate
#' @importFrom data.table as.data.table
#' @examples
#' data(rksp.0)
#' stat <- figureStatistics(rksp.0$mtext, names = FALSE)
#' @export
figureStatistics <- function(t, names = FALSE, normalize = FALSE) {
figureStatistics <- function(t, names = FALSE, normalize = FALSE, segment=c("Drama","Act","Scene")) {

# prevent notes in R CMD check
. <- NULL
@@ -35,18 +36,46 @@ figureStatistics <- function(t, names = FALSE, normalize = FALSE) {
if (names == TRUE) {
b <- quote(Speaker.figure_surface)
}
segment <- match.arg(segment)

setkey(t, "corpus", "drama")
r <- t[,list(tokens=length(Token.surface),
types=data.table::uniqueN(Token.surface),
utterances=data.table::uniqueN(begin),
utteranceLengthMean=mean(rle(begin)$lengths),
utteranceLengthSd=sd(rle(begin)$lengths),
firstBegin=min(begin),
lastEnd=max(end)
),by=.(corpus,drama,length,eval(b))]

if (segment == "Scene") {
r <- t[,list(tokens=length(Token.surface),
types=data.table::uniqueN(Token.surface),
utterances=data.table::uniqueN(begin),
utteranceLengthMean=mean(rle(begin)$lengths),
utteranceLengthSd=sd(rle(begin)$lengths),
firstBegin=min(begin),
lastEnd=max(end)
),by=.(corpus,drama,begin.Act,begin.Scene,length,eval(b))][,begin.Scene:=as.integer(as.factor(begin.Scene)),begin.Act]
r$begin.Act <- as.roman(as.integer(as.factor(r$begin.Act)))
colnames(r)[3:4] <- c("Act","Scene")
fcol <- 6
} else if (segment == "Act") {
r <- t[,list(tokens=length(Token.surface),
types=data.table::uniqueN(Token.surface),
utterances=data.table::uniqueN(begin),
utteranceLengthMean=mean(rle(begin)$lengths),
utteranceLengthSd=sd(rle(begin)$lengths),
firstBegin=min(begin),
lastEnd=max(end)
),by=.(corpus,drama,begin.Act,length,eval(b))]
r$begin.Act <- as.roman(as.integer(as.factor(r$begin.Act)))
colnames(r)[3] <- "Act"
fcol <- 5
} else {
r <- t[,list(tokens=length(Token.surface),
types=data.table::uniqueN(Token.surface),
utterances=data.table::uniqueN(begin),
utteranceLengthMean=mean(rle(begin)$lengths),
utteranceLengthSd=sd(rle(begin)$lengths),
firstBegin=min(begin),
lastEnd=max(end)
),by=.(corpus,drama,length,eval(b))]
fcol <- 4
}

colnames(r)[4] <- "figure"
colnames(r)[fcol] <- "figure"
if (normalize == TRUE) {
r$tokens <- r$tokens / r$length
r$utterances <- ave(r$utterances, r$drama, FUN=function(x) {x/sum(x)})
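figureStatistics() now accepts a segment argument ("Drama", "Act", or "Scene"); per-act and per-scene results label the act as a Roman numeral and renumber scenes within each act. A rough usage sketch, assuming the bundled rksp.0 example data:

data(rksp.0)
# Whole-play figure statistics (unchanged default behaviour)
stat <- figureStatistics(rksp.0$mtext, names = TRUE)
# Per-scene statistics: adds Act (Roman numeral) and Scene columns
statScene <- figureStatistics(rksp.0$mtext, names = TRUE, segment = "Scene")
head(statScene)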
16 changes: 14 additions & 2 deletions R/load.R
@@ -1,16 +1,28 @@

#' This function initialises the import from XMI files.
#' This function initialises the paths to data files.
#' @param dataDirectory A path to the directory in which data and metadata are located.
#' "~/QuaDramA/Data" by default.
#' "~/QuaDramA/Data2" by default.
#' @param collectionDirectory A path to the directory in which collections are stored.
#' By default, the directory is called "collection" below the data directory.
#' @export
setup <- function(dataDirectory = file.path(path.expand("~"),"QuaDramA","Data2"),
collectionDirectory = file.path(dataDirectory,"collections")) {
message("Since 2.1 it is no longer necessary to call setup() if you're happy with the default paths.")
options(qd.datadir=dataDirectory)
options(qd.collectionDirectory=collectionDirectory)
}

#' @export
#' @rdname setup
setDataDirectory <- function(dataDirectory = file.path(path.expand("~"),"QuaDramA","Data2")) {
options(qd.datadir=dataDirectory)
}

#' @export
#' @rdname setup
setCollectionDirectory <- function(collectionDirectory = file.path(getOption("qd.datadir"), "collections")) {
options(qd.collectionDirectory=collectionDirectory)
}

#' @importFrom utils read.table
loadSetsInternally <- function() {
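Because the default paths are now set when the package is loaded (see R/zzz.R below), setup() only needs to be called to override them, and the two new setters adjust one path at a time. An illustrative sketch; the data directory shown is a hypothetical example:

# Only needed when deviating from the defaults established at load time
setDataDirectory(file.path(path.expand("~"), "my-quadrama-data"))  # hypothetical path
setCollectionDirectory()   # defaults to <data directory>/collections
getOption("qd.datadir")
getOption("qd.collectionDirectory")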
Empty file removed R/onLoad.R
Empty file.
2 changes: 2 additions & 0 deletions R/plots.R
@@ -44,11 +44,13 @@ plotUtterancePositions <- function(utteranceStatistics,segmentedText=NULL,colors
#' @importFrom fmsb radarchart
#' @export
#' @examples
#' \dontrun{
#' data(rksp.0)
#' fnames <- c("Krieg", "Liebe", "Familie", "Ratio","Religion")
#' ds <- dictionaryStatistics(rksp.0$mtext, normalizeByField=TRUE, names=TRUE,
#' fieldnames=fnames, asList=TRUE)
#' plotSpiderWebs(dstat=ds,max=50)
#' }
plotSpiderWebs <- function(dstat=NULL, mat=dstat$mat, names=dstat$figure,
symbols=c(17,16,15,4,8),
maxValue=max(mat),minValue=min(mat), cglcol="black",
104 changes: 85 additions & 19 deletions R/text.R
@@ -23,28 +23,38 @@ qd.colors <- c(rgb(120,28,129, maxColorValue = 255),
#' @param text The dramatic text in table form
#' @param by A character vector, either "rank" or "tokens" (or an unambiguous substring)
#' @param threshold A number specifying the limit
#' @param other Whether to summarize filtered figures as 'OTHER' instead of removing their speech
#' @export
#' @examples
#' data(rksp.0)
#' text.top10 <- limitFigures(rksp.0$mtext)
limitFigures <- function(text, by=c("rank","tokens"), threshold=ifelse(by=="tokens",500,10)) {
limitFigures <- function(text, by=c("rank","tokens"), threshold=ifelse(by=="tokens",500,10), other=FALSE) {
by <- match.arg(by)
switch(by,
tokens=limitFiguresByTokens(text, minTokens=threshold),
rank=limitFiguresByRank(text, maxRank = threshold),
tokens=limitFiguresByTokens(text, minTokens=threshold, other=other),
rank=limitFiguresByRank(text, maxRank = threshold, other=other),
stop("Invalid filtering criterion"))
}

#' This method removes the spoken tokens of all but the most frequent n figures
#' @param t The text, a data frame listing each token for each figure
#' @param maxRank Up to maxRank figures remain in the data set
#' @param other Whether to summarize filtered figures as 'OTHER' instead of removing their speech
#' @keywords internal
#' @importFrom utils head
limitFiguresByRank <- function(t, maxRank=10) {
counts <- aggregate(t$Speaker.figure_surface, by=list(t$drama, t$Speaker.figure_id), length)
counts <- counts[order(counts$x, decreasing = TRUE),]
rcounts <- Reduce(rbind, by(counts, counts["Group.1"], head, n=maxRank))
r <- t[paste(t$drama, t$Speaker.figure_id) %in% paste(rcounts$Group.1, rcounts$Group.2),]
limitFiguresByRank <- function(t, maxRank=10, other=FALSE) {
if (other == FALSE) {
r <- t[,n:=.N,.(corpus,drama,Speaker.figure_surface)][,.SD[n%in%maxN(unique(n),maxRank)], by=.(corpus,drama)][,n:=NULL,][]
} else {
counts <- aggregate(t$Speaker.figure_surface, by=list(t$drama, t$Speaker.figure_id, t$Speaker.figure_surface), length)
counts <- counts[order(counts$x, decreasing = TRUE),]
rcounts <- Reduce(rbind, by(counts, counts["Group.1"], head, n=maxRank))
r <- t
levels(r$Speaker.figure_id) <- c(levels(r$Speaker.figure_id),"OTHER")
levels(r$Speaker.figure_surface) <- c(levels(r$Speaker.figure_surface),"OTHER")
r$Speaker.figure_id[!(r$Speaker.figure_id %in% rcounts$Group.2)] <- "OTHER"
r$Speaker.figure_surface[!(r$Speaker.figure_surface %in% rcounts$Group.3)] <- "OTHER"
}
r$Speaker.figure_id <- droplevels(r$Speaker.figure_id)
r$Speaker.figure_surface <- droplevels(r$Speaker.figure_surface)
r
@@ -53,14 +63,23 @@ limitFiguresByRank <- function(t, maxRank=10) {
#' This method removes the spoken tokens by all figures that speak infrequently.
#' @param t The text, a data frame listing each token for each figure
#' @param minTokens The minimal amount of tokens a figure has to speak
#' @param other Whether to summarize filtered figures as 'OTHER' instead of removing their speech
#' @keywords internal
limitFiguresByTokens <- function(t, minTokens=100) {
counts <- tapply(t$Speaker.figure_surface, paste(t$drama, t$Speaker.figure_id), length)
write(paste(length(counts[counts > minTokens]), "figures remaining."),stderr())
r <- subset(t, counts[paste(t$drama, t$Speaker.figure_id)] > minTokens )
r$Speaker.figure_id <- droplevels(r$Speaker.figure_id)
r$Speaker.figure_surface <- droplevels(r$Speaker.figure_surface)
r
limitFiguresByTokens <- function(t, minTokens=100, other=FALSE) {
if (other == FALSE) {
r <- t[,n:=.N,.(corpus,drama,Speaker.figure_surface)][,.SD[n>=minTokens],by=.(corpus,drama)][,n:=NULL][]
} else {
counts <- aggregate(t$Speaker.figure_surface, by=list(t$drama, t$Speaker.figure_id, t$Speaker.figure_surface), length)
rcounts <- counts[(counts$x > minTokens),]
r <- t
levels(r$Speaker.figure_id) <- c(levels(r$Speaker.figure_id),"OTHER")
levels(r$Speaker.figure_surface) <- c(levels(r$Speaker.figure_surface),"OTHER")
r$Speaker.figure_id[!(r$Speaker.figure_id %in% rcounts$Group.2)] <- "OTHER"
r$Speaker.figure_surface[!(r$Speaker.figure_surface %in% rcounts$Group.3)] <- "OTHER"
}
r$Speaker.figure_id <- droplevels(r$Speaker.figure_id)
r$Speaker.figure_surface <- droplevels(r$Speaker.figure_surface)
r
}


Expand All @@ -75,6 +94,33 @@ limit.figures.by.tokens <- function(...) {
limitFiguresByTokens(...)
}

#' @title Filtering Mentioned Figures
#' @description This function can be used to remove the mentions of figures
#' that do not appear as speakers in the subsetted input text (after using
#' limitFigures(), for example), or to summarize them as 'OTHER'.
#' @param t The text, a data frame listing each token for each figure
#' @param other Whether to summarize mentioned figures as 'OTHER'
#' @export
#' @examples
#' data(rksp.0)
#' text.top10.filtered <- filterMentioned(limitFigures(rksp.0$mtext))
filterMentioned <- function(t, other=FALSE) {
figure_id.set <- unique(t$Speaker.figure_id)
figure_surface.set <- unique(t$Speaker.figure_surface)
if (other == FALSE) {
t$Mentioned.figure_id[!(t$Mentioned.figure_id %in% figure_id.set)] <- NA
t$Mentioned.figure_surface[!t$Mentioned.figure_surface %in% figure_surface.set] <- NA
} else {
levels(t$Mentioned.figure_id) <- c(levels(t$Mentioned.figure_id),"OTHER")
levels(t$Mentioned.figure_surface) <- c(levels(t$Mentioned.figure_surface),"OTHER")
t$Mentioned.figure_id[!(t$Mentioned.figure_id %in% figure_id.set) & !(is.na(t$Mentioned.figure_id))] <- "OTHER"
t$Mentioned.figure_surface[!(t$Mentioned.figure_surface %in% figure_surface.set) & !(is.na(t$Mentioned.figure_surface))] <- "OTHER"
}
t$Mentioned.figure_id <- droplevels(t$Mentioned.figure_id)
t$Mentioned.figure_surface <- droplevels(t$Mentioned.figure_surface)
t
}

tfidf1 <- function(word) {
docfreq <- sum(word>0)
docfreq <- log((length(word)+1) / (sum(word>0)))
@@ -126,15 +172,26 @@ extractTopTerms <- function(mat, top=10) {

#' @title Report
#' @description generates a report for a specific dramatic text
#' @param id The id of the text
#' @param id The id of the text or a list of ids
#' @param of The output file
#' @param colors A list of colors to be used for plots
#' @param type The type of the report. "Single" gives a report about a single play,
#' while "Compare" can be used to compare multiple editions of a play
#' @param ... Arguments passed through to the rmarkdown document
#' @importFrom rmarkdown render
#' @importFrom igraph graph_from_adjacency_matrix plot.igraph layout_ on_grid
#' @export
report <- function(id="test:rksp.0", of=file.path(getwd(),paste0(unlist(strsplit(id,":",fixed=TRUE))[2], ".html")), colors=qd.colors) {
report <- function(id="test:rksp.0",
of=file.path(getwd(),paste0(unlist(strsplit(id,":",fixed=TRUE))[2], ".html")),
type=c("Single"),
...) {
force(of)
rmarkdown::render(system.file("rmd/Report.Rmd", package="DramaAnalysis"), params=list(id=id, col=colors),
type <- match.arg(type)

fileName <- switch(type,
Single="Report.Rmd",
Compare="Compare-editions.Rmd")
rmarkdown::render(system.file(paste0("rmd/",fileName), package="DramaAnalysis"),
params=list(id=id,col=qd.colors,...),
output_format = "html_document",
output_file = of)
}
Expand Down Expand Up @@ -231,3 +288,12 @@ last <- function(x, n=0) {
}
sort(x,partial=len-(n-1))[len-(n-1)]
}

maxN <- function(x, N=2){
len <- length(x)
if(N>len){
warning('N greater than length(x). Setting N=length(x)')
N <- length(x)
}
sort(x,partial=len-N+1)[(len-N+1):(len)]
}
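Taken together, the changes in R/text.R let filtered-out figures be collapsed into an 'OTHER' level instead of being dropped, restrict figure mentions to the remaining speakers, route report() through a report type, and add the maxN() helper used for rank filtering. A hedged usage sketch, assuming the bundled rksp.0 example data:

data(rksp.0)
# Keep the ten highest-ranked speakers; collapse everyone else into "OTHER"
text.top10 <- limitFigures(rksp.0$mtext, by = "rank", threshold = 10, other = TRUE)
# Remove (or, with other = TRUE, relabel) mentions of figures that no longer speak
text.top10.filtered <- filterMentioned(text.top10)
# maxN() returns the N largest values of a vector (order not guaranteed)
maxN(c(3, 9, 1, 7), N = 2)
# report(id = "test:rksp.0")   # renders rmd/Report.Rmd; type defaults to "Single"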
10 changes: 10 additions & 0 deletions R/zzz.R
@@ -0,0 +1,10 @@
.onLoad <- function(libname, pkgname) {
dataDirectory <- file.path(path.expand("~"),"QuaDramA","Data2")
collectionDirectory <- file.path(dataDirectory,"collections")
options(qd.datadir=dataDirectory)
options(qd.collectionDirectory=collectionDirectory)
}

.onAttach <- function(libname, pkgname) {
packageStartupMessage(paste("DramaAnalysis",utils::packageVersion("DramaAnalysis")))
}
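The new .onLoad() hook sets the default data and collection directories as options as soon as the package is loaded, which is why calling setup() became optional; .onAttach() prints the package version. An illustrative check of the resulting options:

library(DramaAnalysis)
getOption("qd.datadir")              # ~/QuaDramA/Data2 by default
getOption("qd.collectionDirectory")  # <data directory>/collections by default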