forked from rll307/WorkshopTwitter2022_EN
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path09_quanteda_E3.R
104 lines (70 loc) · 2.9 KB
/
09_quanteda_E3.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# Author's note, written to standard output when the script is run.
writeLines(
  text = "It is part of my CNPq-funded project and seeks to make corpus tools and R accessible. If you have any doubts or wish to make any research contact please send me an email. Rodrigo de Lima-Lopes rll307@unicamp.br",
  con = stdout()
)
# Packages ----------------------------------------------------------------
library(quanteda)
library(quanteda.textplots)
library(quanteda.textstats)
library(ggplot2)
# Corpus ------------------------------------------------------------------
# Build a quanteda corpus from `presidents`. That object is not created in
# this script, so it has to exist in the session already.
presidents.C <- presidents |> corpus()

# Which document variables are attached to the corpus? Show the first rows.
presidents.C |>
  docvars() |>
  head()

# Sub-corpora: one per account, filtered on the `screen_name` variable.
gabrielboric.c <- presidents.C |>
  corpus_subset(subset = screen_name == "gabrielboric")
sebastianpinera.c <- presidents.C |>
  corpus_subset(subset = screen_name == "sebastianpinera")
# Tokenisation ------------------------------------------------------------
# Each sub-corpus goes through the same three steps, in this order:
#   1. tokenise, dropping punctuation, symbols and numbers;
#   2. remove the stopwords("es") list (matched as fixed strings);
#   3. lowercase the remaining tokens.
# The native pipe `|>` (already used elsewhere in this script) replaces
# magrittr's `%>%`, so this section does not depend on `%>%` being exported
# by one of the attached packages.

# Boric
gabrielboric.toc <- gabrielboric.c |>
  tokens(
    remove_punct = TRUE,
    remove_symbols = TRUE,
    remove_numbers = TRUE,
    verbose = TRUE
  ) |>
  tokens_remove(
    stopwords("es"),
    valuetype = "fixed",
    verbose = TRUE
  ) |>
  tokens_tolower()

# Piñera
sebastianpinera.toc <- sebastianpinera.c |>
  tokens(
    remove_punct = TRUE,
    remove_symbols = TRUE,
    remove_numbers = TRUE,
    verbose = TRUE
  ) |>
  tokens_remove(
    stopwords("es"),
    valuetype = "fixed",
    verbose = TRUE
  ) |>
  tokens_tolower()
# Keyword in context ------------------------------------------------------
# Open the concordance lines for "chile" in the data viewer, one account
# at a time.
View(kwic(gabrielboric.toc, "chile"))
View(kwic(sebastianpinera.toc, "chile"))
# Bigrams -----------------------------------------------------------------
# Two-word collocations scored with the "lambda" method; only sequences
# occurring at least twice (min_count = 2) are kept.
gabrielboric.col <- gabrielboric.toc |>
  textstat_collocations(
    method = "lambda",
    size = 2,
    min_count = 2,
    smoothing = 0.5,
    tolower = TRUE,
    verbose = TRUE
  )

sebastianpinera.col <- sebastianpinera.toc |>
  textstat_collocations(
    method = "lambda",
    size = 2,
    min_count = 2,
    smoothing = 0.5,
    tolower = TRUE,
    verbose = TRUE
  )

# Inspect both collocation tables in the data viewer.
View(gabrielboric.col)
View(sebastianpinera.col)
# Boric vs Piñera ---------------------------------------------------------
# `my.stopwords` is not created anywhere in this script; presumably an
# earlier workshop script leaves it in the session (TODO confirm). When it
# is missing, fall back to stopwords("es"), the list this script already
# uses for the individual sub-corpora, so the pipeline below does not stop
# with an "object not found" error. An existing `my.stopwords` is untouched.
if (!exists("my.stopwords")) {
  my.stopwords <- stopwords("es")
}

# Tokenise the full corpus, drop stopwords, lowercase, then merge the
# documents into one group per `screen_name`. The native pipe `|>` is used
# so the chain does not depend on magrittr's `%>%` being available.
presidents.toc <- presidents.C |>
  tokens(
    remove_punct = TRUE,
    remove_symbols = TRUE,
    remove_numbers = TRUE,
    verbose = TRUE
  ) |>
  tokens_remove(pattern = my.stopwords) |>
  tokens_tolower() |>
  tokens_group(groups = screen_name)

# Document-feature matrix built from the grouped tokens.
dfm.pres <- dfm(presidents.toc, verbose = TRUE)
# Keyness plot ------------------------------------------------------------
# Compute keyness (measure = "lr") with the "gabrielboric" document as the
# target, then plot the result with n = 25.
textplot_keyness(
  textstat_keyness(dfm.pres, target = "gabrielboric", measure = "lr"),
  n = 25
)