forked from kbenoit/ITAUR-Short
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathadvanced.Rmd
102 lines (85 loc) · 3.39 KB
/
advanced.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
---
title: "Advanced Text Analysis and Extensions"
author: "Kenneth Benoit and Paul Nulty"
date: "30th November 2015"
output: html_document
---
In this section we will explore some text analysis and analysis of metadata from a corpus of tweets retrieved from the Twitter API. The tweets are a small sample from a collection of tweets relating to the European Parliament elections of 2014.
Load the data frame containing the sample tweets:
```{r}
# Load quanteda and the pre-saved sample of EP-election tweets, then show
# the structure of the data frame.
# library() errors immediately if the package is missing; require() only
# returns FALSE, which can hide a broken setup until much later.
library(quanteda)
load("tweetSample.RData")
str(tweetSample)
```
```{r}
# library() errors immediately on a missing package, unlike require().
library(lubridate)
library(dplyr)
# Add a day-of-year counter and a proper Date column for plotting.
tweetSample <- mutate(tweetSample, day = yday(created_at))
tweetSample <- mutate(tweetSample, dayDate = as.Date(day - 1, origin = "2014-01-01"))
# Tag the tweets mentioning each candidate (case-insensitive match on the
# tweet text), then stack the three subsets into a single data frame.
juncker <- filter(tweetSample, grepl("juncker", text, ignore.case = TRUE)) %>%
  mutate(kand = "Juncker")
schulz <- filter(tweetSample, grepl("schulz", text, ignore.case = TRUE)) %>%
  mutate(kand = "Schulz")
verhof <- filter(tweetSample, grepl("verhofstadt", text, ignore.case = TRUE)) %>%
  mutate(kand = "Verhofstadt")
spitzAll <- bind_rows(juncker, schulz, verhof)
```
Once the data is in the correct format, we can use ggplot to display the candidate mentions on a single plot:
```{r}
# library() errors immediately on a missing package, unlike require().
library(ggplot2)
library(scales)
# Daily counts of tweets mentioning each candidate, with the day-of-year
# counter converted back to a real Date for the x axis.
plotDf <- count(spitzAll, kand, day = day) %>%
  mutate(day = as.Date(day - 1, origin = "2014-01-01"))
ggplot(data = plotDf, aes(x = day, y = n, colour = kand)) +
  geom_line(size = 1) +
  scale_y_continuous(labels = comma) +
  # Dashed verticals at 15 and 25 May 2014.
  # NOTE(review): presumably the televised debate and election day -- confirm.
  geom_vline(xintercept = as.numeric(as.Date("2014-05-15")), linetype = 4) +
  geom_vline(xintercept = as.numeric(as.Date("2014-05-25")), linetype = 4) +
  theme(axis.text = element_text(size = 12),
        axis.title = element_text(size = 14, face = "bold"))
```
We can use `selectFeatures()` on the dfm with a regular expression to analyse only the hashtags in each candidate's text.
```{r}
# Top hashtags among tweets that mention Juncker: build a corpus from the
# tweet texts (keeping the screen name as a document variable), form a
# document-feature matrix, then keep only hashtag features.
junckerDocvars <- data.frame(user = juncker$user_screen_name)
junckerCorpus <- corpus(juncker$text, docvars = junckerDocvars)
junckerDfm <- dfm(junckerCorpus)
# Keep features starting with "#" via regex; the glob pattern "#*" with
# valuetype = "glob" selects the same features.
junckerDfm <- selectFeatures(junckerDfm, "^#.*", "keep", valuetype = "regex")
# Rank every remaining (hashtag) feature by frequency.
topfeatures(junckerDfm, nfeature(junckerDfm))
```
## Further analysis examples
Wordscores:
```{r}
# Wordscores: fit on documents with a known training class, then predict a
# score for every document and compare against the held-out test class.
data(amicusCorpus, package = "quantedaData")
# Recode the two-level training factor (numeric levels 1, 2) onto reference
# scores -1 / +1; documents without a training class become NA.
trainScores <- (as.numeric(docvars(amicusCorpus, "trainclass")) - 1.5) * 2
amicusDfm <- dfm(amicusCorpus, verbose = FALSE)
wsFit <- textmodel(amicusDfm, y = trainScores, model = "wordscores")
summary(wsFit)
wsPred <- predict(wsFit, newdata = amicusDfm)
summary(wsPred)
# Raw predicted scores grouped by the test classification.
plot(wsPred@textscores$textscore_raw ~ docvars(amicusCorpus, "testclass"),
     horizontal = TRUE, xlab = "Predicted document score",
     ylab = "Test class", las = 1)
```
Correspondence analysis:
```{r, fig.width = 6, fig.height = 6}
# Correspondence analysis of the Irish 2010 budget debate corpus.
# library() (not require()) so a missing package fails loudly, and loaded
# up front: the ca package provides the plot method used below.
library(ca)
ieDfm <- dfm(ie2010Corpus, verbose = FALSE)
ieCA <- textmodel(ieDfm, model = "ca")
plot(ieCA, what = c("all", "none"))
```
Poisson scaling:
```{r, fig.width = 6, fig.height = 4}
# Poisson scaling ("wordfish"): unsupervised one-dimensional scaling of the
# budget-debate documents.
wordfishFit <- textmodel(ieDfm, model = "wordfish")
summary(wordfishFit)
# Dot chart of the estimated document positions (theta), labelled with each
# speaker's name and party.
speakerLabels <- paste(docvars(ie2010Corpus, "name"),
                       docvars(ie2010Corpus, "party"))
dotchart(wordfishFit@theta, labels = speakerLabels)
```
Topic models:
```{r}
# Fit a 20-topic LDA model to the post-1950 inaugural addresses.
# library() errors immediately on a missing package, unlike require().
library(topicmodels)
mycorpus <- subset(inaugCorpus, Year > 1950)
# Stem, and drop stopwords plus a few high-frequency boilerplate terms
# (glob patterns "peopl*" / "americ*" cover their stemmed variants).
quantdfm <- dfm(mycorpus, verbose = FALSE, stem = TRUE,
                ignoredFeatures = c(stopwords("english"), "will", "us",
                                    "nation", "can", "peopl*", "americ*"))
# Convert the quanteda dfm to the format the topicmodels package expects.
ldadfm <- convert(quantdfm, to = "topicmodels")
lda <- LDA(ldadfm, k = 20, control = list(alpha = 0.1))
terms(lda, 10)
```