-
Notifications
You must be signed in to change notification settings - Fork 2
/
data_init.R
59 lines (47 loc) · 1.93 KB
/
data_init.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# Remember when the run began; the elapsed time is computed from this value
# at the end of the script.
starttime <- Sys.time()

library(tosca)

# Root directory of the data share; every input file is addressed below it.
rootpath <- "//media/TextMining/DoCMA/data"

# Results are saved under relative file names, so make the project folder
# the working directory.
workdir <- file.path(rootpath, "Working_Paper_Uncertainty")
setwd(workdir)

# Container for one corpus object per newspaper (filled below, merged later).
objlist <- list()
# All three sources are passed through filterDate() with the same s.date.
start_date <- as.Date("2001-01-01")

## Welt
# After the date filter, keep only the ids whose meta$resource is one of the
# two spellings of "Die Welt".
file <- file.path(rootpath, "Welt", "welt-data.rds")
obj <- readRDS(file)
obj <- filterDate(obj, s.date = start_date)
is_welt <- obj$meta$resource %in% c("Die Welt", "DIE WELT")
obj <- filterID(obj, obj$meta$id[is_welt])
objlist[[1]] <- obj

## SZ
file <- file.path(rootpath, "Sueddeutsche", "SZ-data.rds")
obj <- readRDS(file)
obj <- filterDate(obj, s.date = start_date)
# Flatten every text entry and join its pieces with single blanks, so each
# document becomes one character string.
obj$text <- lapply(
  obj$text,
  function(parts) paste(unlist(parts), collapse = " ")
)
objlist[[2]] <- obj

## HB
# No further processing here: the date-filtered object is stored directly.
file <- file.path(rootpath, "HB_WiWo", "HB", "HB-data.rds")
objlist[[3]] <- filterDate(readRDS(file), s.date = start_date)
## merge Welt, SZ, HB
obj <- mergeTextmeta(objlist)
# The per-source list is not used again; drop it and trigger a garbage
# collection before the cleaning steps below.
rm(objlist)
gc()

## clean texts
# removeUmlauts() is applied both before and after removeHTML().
# NOTE(review): presumably the second pass handles umlauts that only appear
# once HTML entities have been converted - confirm against tosca.
obj$text <- removeUmlauts(obj$text)
obj$text <- removeHTML(obj$text, hex = FALSE, symbols = TRUE)
obj$text <- removeUmlauts(obj$text)
obj$text <- removeXML(obj$text)

# Remaining character-level clean-up, done in one pass over the texts.
obj$text <- lapply(obj$text, function(x) {
  # Anything still shaped like "&...;" becomes a blank.
  # NOTE(review): "[^;]*" also spans whitespace, so a literal "&" followed by
  # a much later ";" removes everything in between - confirm this is wanted.
  x <- gsub("&[^;]*;", " ", x)
  # U+00AD (soft hyphen) is an invisible hyphenation hint inside a word, so
  # it is deleted without a replacement.
  x <- gsub("\u00AD", "", x)
  # U+00A0 (no-break space) separates words just like a normal space.
  # Replace it with a blank; deleting it would glue the neighbouring words
  # together (e.g. "10<U+00A0>Prozent" -> "10Prozent").
  gsub("\u00A0", " ", x)
})

## order by date and drop duplicated texts
# Sort the meta data chronologically and bring the texts into the same order
# by matching text names against meta ids.
obj$meta <- obj$meta[order(obj$meta$date), ]
obj$text <- obj$text[match(obj$meta$id, names(obj$text))]
# duplicated() flags every repeat after the first occurrence, so of several
# identical texts only the first one in date order is kept.
obj$text <- obj$text[!duplicated(obj$text)]
# Keep only the meta rows whose text survived the de-duplication.
obj$meta <- obj$meta[obj$meta$id %in% names(obj$text), ]
## save objects
# Decide once, before anything is written, whether the "updated" snapshots
# are created: all three are written only if obj_updated.rds is not listed
# in the working directory, so existing ones are never overwritten.
new <- !is.element("obj_updated.rds", list.files())

# Stage 1: full cleaned corpus.
saveRDS(obj, file = "obj_init.rds")
if (new) {
  saveRDS(obj, file = "obj_updated.rds")
}

# Stage 2: restrict to the texts filterWord() flags for "wirtschaft"
# (case-insensitive). The ids are taken from the names of the TRUE entries.
has_wirtschaft <- filterWord(
  obj$text, "wirtschaft",
  ignore.case = TRUE, out = "bin"
)
id <- names(which(has_wirtschaft))
obj <- filterID(obj, id)
saveRDS(obj, file = "obj_init_wirtschaft.rds")
if (new) {
  saveRDS(obj, file = "obj_updated_wirtschaft.rds")
}

# Stage 3: within that subset, restrict further to the texts flagged for
# "unsicher" (case-insensitive).
has_unsicher <- filterWord(
  obj$text, "unsicher",
  ignore.case = TRUE, out = "bin"
)
id <- names(which(has_unsicher))
obj <- filterID(obj, id)
saveRDS(obj, file = "obj_init_wirtschaft_unsicher.rds")
if (new) {
  saveRDS(obj, file = "obj_updated_wirtschaft_unsicher.rds")
}

gc()
# Total runtime of the script in hours, measured from starttime.
difftime(Sys.time(), starttime, units = "hours")