# main.R
#==== USER SECTION =================================================================
u_path_pdf <- "data/pdfs_all"
u_path_xml <- "data/xml_all"
#==== INITIALISE ===================================================================
# load libraries
library(devtools)
library(bibliometrix)
library(SnowballC)
library(servr)
library(httpuv)
library(ggplot2)
library(tm)
#library(wordcloud)
library(dplyr)
library(tidyr)
library(stringr)
library(topicmodels)
#library(igraph)
library(jsonlite)
library(pdftools)
library(wordcloud)
#library(text2vec)
#library(NMF)
library(LDAvis)
library(xml2)
#library(googlesheets)
library(scimetrix)
#googlesheets::gs_auth()
# load functions
source("functions/pdf_functions.R")
# get list of pdfs
v_path_pdfs <- list.files(u_path_pdf, full.names = TRUE)
v_path_xmls <- list.files(u_path_xml, full.names = TRUE)
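# Optional sanity check (added for convenience, not part of the original workflow):
# report how many files were found in the two input folders.
cat("Found", length(v_path_pdfs), "PDF files and", length(v_path_xmls), "XML files\n")
if (length(v_path_pdfs) == 0) warning("No PDF files found under ", u_path_pdf)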
#==== PROCESS DATA =================================================================
#---- Get doc info from google sheet -----------------------------------------------
#gs <- gs_url("https://docs.google.com/spreadsheets/d/1sCPMV45U2q5E9eYm7-vJfp9TNKrGIhl34dhEEEMzkok/edit#gid=1473283493")
#gs_docs <- googlesheets::gs_read(gs, "Only references")
#save(gs_docs, file="data/gs_docs.RData")
#---- Get pdf content and structure ------------------------------------------------
#-- TODO: Join two lapply methods
v_pdf_source <- lapply(v_path_pdfs,
function(x) {
#print(basename(x))
# get pdf infos
info <- pdf_info(x)
so <- info$keys$Subject
if (!is.null(so)) {
if (grepl("doi", so)) so <- strsplit(so, ",", fixed=TRUE)[[1]][1]
} else {
  # fall back to the XMP metadata; if a "source" entry is present, take the first line of page 3 of the text
  metadata <- unlist(strsplit(info$metadata, "\n"))
  so <- grep("source", tolower(metadata), value = TRUE)
  if (length(so) != 0) {
    text <- pdf_text(x)
    so <- strsplit(text[3], "\r")[[1]][1]
  } else {
    so <- NA_character_  # could not extract source
  }
}
#print(paste(" > Source is: ", so))
so
})
v_pdf_summary <- lapply(v_path_pdfs,
function(x) {
print(basename(x))
# get pdf infos
info <- pdf_info(x)
# get publishing year
py <- ifelse(is.null(info$created), "", format(info$created, "%Y"))
# get number of pages
pages <- ifelse(is.null(info$pages), "", info$pages)
# Additional info
# get author info
au <- ifelse(is.null(info$keys$Author), "", info$keys$Author)
# get title
title <- ifelse(is.null(info$keys$Title), "", info$keys$Title)
if (title == "") {
title <- extract_title(x)
}
# get doi
doi <- ifelse(is.null(info$keys$doi), "", info$keys$doi)
if (doi == "" & pages <= 40) {
tryCatch(
doi <- search_doi_in_text(x),
error = function(e) {print(paste("Warning!! >>", e))}
)
}
if (doi == "" & pages <= 40) {
doi <- get_prism_doi(x)
}
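# Normalise the DOI string: strip "doi:", "doi.org/" or a bare "doi" prefix and keep the first whitespace-separated token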
if (grepl("doi:", tolower(doi))) {
doi <- trimws(strsplit(trimws(substr(doi, as.numeric(regexpr("doi:", tolower(doi)))+4, nchar(doi))), " ", fixed=T)[[1]][1])
}
if (grepl("doi.org", tolower(doi))) {
doi <- trimws(strsplit(trimws(substr(doi, as.numeric(regexpr("doi.org", tolower(doi)))+8, nchar(doi))), " ", fixed=T)[[1]][1])
}
if (grepl("doi", tolower(doi))) {
doi <- trimws(strsplit(trimws(substr(doi, as.numeric(regexpr("doi", tolower(doi)))+3, nchar(doi))), " ", fixed=T)[[1]][1])
}
so <- info$keys$Subject
if (!is.null(so)) {
if (grepl("doi", so)) so <- strsplit(so, ",", fixed=TRUE)[[1]][1]
} else {
  # fall back to the XMP metadata; if a "source" entry is present, take the first line of page 2 of the text
  metadata <- unlist(strsplit(info$metadata, "\n"))
  so <- grep("source", tolower(metadata), value = TRUE)
  if (length(so) != 0) {
    text <- pdf_text(x)
    so <- strsplit(text[2], "\r")[[1]][1]
  }
  if (length(so) == 0 || is.na(so)) so <- ""  # could not extract source
}
data.frame(
author = au,
year = py,
title = title,
doi = doi,
so = so,
pages = pages,
filename = basename(x),
path = x
)
}) %>%
do.call("rbind", .)
v_pdf_years <- lapply(v_path_pdfs,
function(x) {
print(basename(x))
# Initialise fields
py <- NA
au <- ""
type <- "article"
comment <- ""
# Find publication year
tmp <- regexpr(pattern = "[12][0789][0-9]{2}", text = basename(x))
if (as.numeric(tmp) != -1) {
py <- substr(basename(x), as.numeric(tmp), as.numeric(tmp)+3)
au <- substr(basename(x), 1, as.numeric(tmp)-2)
}
au <- gsub(",", "", au)
au <- gsub("_", " ", au)
if (!grepl("et al.", au, fixed = TRUE)) au <- gsub("et al", "et al.", au, fixed = TRUE)
# Find publication type
tmp <- grep("manual|description|documentation", basename(x), value=T, ignore.case = TRUE)
if (length(tmp) != 0) {
type <- "model documentation"
}
if (basename(x) == "Meinshausen et al. 2011 Emulating coupled atmosphere-ocean and carbon cycle models with a simpler model, MAGICC6 – Part 1 Model description and calibration.pdf") type <- "article"
# Manual adjustments
if (basename(x) == "Berger et al-IJGEI-7.pdf") au <- "Berger et al"
if (basename(x) == "Berger et al-IJGEI400103.pdf") au <- "Berger et al"
if (basename(x) == "IFs Agricultural Model Documentation V25.03.pdf") au <- "Rothman et al."
if (basename(x) == "IFs Economic model Documentation v43 clean.pdf") au <- "Hughes"
if (basename(x) == "IFs Energy model Documentation_v9.pdf") au <- "Hughes et al."
if (basename(x) == "IFs Health model Documentation v49.pdf") au <- "Hughes et al."
if (basename(x) == "IFs Infrastructure model Documentation v12 - clean.pdf") au <- "Rothman and Irfan"
if (basename(x) == "IFs Population_model Documentation_v10.pdf") au <- "Hughes"
if (basename(x) == "Berger et al-IJGEI-7.pdf") py <- 2017
if (basename(x) == "Berger et al-IJGEI400103.pdf") py <- 2017
if (basename(x) == "IFs Agricultural Model Documentation V25.03.pdf") py <- 2017
if (basename(x) == "IFs Economic model Documentation v43 clean.pdf") py <- 2015
if (basename(x) == "IFs Energy model Documentation_v9.pdf") py <- 2014
if (basename(x) == "IFs Health model Documentation v49.pdf") py <- 2014
if (basename(x) == "IFs Infrastructure model Documentation v12 - clean.pdf") py <- 2013
if (basename(x) == "IFs Population_model Documentation_v10.pdf") py <- 2014
if (basename(x) == "Alkemade et al. 2011 Applying GLOBIO at different geographical levels.pdf") type <- "book chapter"
if (basename(x) == "Biemans 2012 Water constraints on future food production.pdf") type <- "thesis"
if (basename(x) == "Bohl et al. 2016 Understanding and Forecasting Demographic Risk and Benefits.pdf") type <- "report"
if (basename(x) == "Bohl et al. 2017 Development Trends Report for South Africa.pdf") type <- "report"
if (basename(x) == "Bohl et al. 2017 Development Trends Report for Southern Africa.pdf") type <- "report"
if (basename(x) == "Bosello et al. 2014 (documentation damage function WITCH model).pdf") type <- "working paper"
if (basename(x) == "Bosetti et al. 2009 The 2008 WITCH Model New Model Features and Baseline.pdf") type <- "working paper"
if (basename(x) == "Clarke et al 2007.pdf") type <- "report"
if (basename(x) == "Cilliers 2013 Assessing Long-Term State Fragility in Africa Prospects for 26 ‘More Fragile’ Countries.pdf") type <- "report"
if (basename(x) == "Cilliers 2011 African Futures 2050 The Next 40 Years.pdf") type <- "report"
if (basename(x) == "Chateau 2011 Employment Impacts of Climate Change Mitigation Policies in OECD.pdf") type <- "working paper"
if (basename(x) == "Cave et al. 2009 Trends in Connectivity Technologies and their Socioeconomic Impacts.pdf") type <- "report"
if (basename(x) == "Cantore 2011 Future Paths of Poverty A Scenario Analysis with Integrated Assessment Models.pdf") type <- "working paper"
if (basename(x) == "Burt et al. 2014 Eradicating poverty in fragile states prospects of reaching the high-hanging fruit by 2030.pdf") type <- "working paper"
if (basename(x) == "Burrows 2016 Reducing the Risks from Rapid Demographic Change.pdf") type <- "report"
if (basename(x) == "Goedkoop et al. 2014 ReCiPe 2008 (version 1.08)—report 1 characterisation.pdf") type <- "report"
if (basename(x) == "Esbaugh et al. 2011 Taps and Toilets How Greater Access Can Radically Improve Africa’s Future.pdf") type <- "working paper"
if (basename(x) == "Durand-Lasserve et al. 2015 Modelling of distributional impacts of energy subsidy reforms.pdf") type <- "working paper"
if (basename(x) == "Dellink 2017 International trade consequences of climate change.pdf") type <- "working paper"
if (basename(x) == "Dellink 2014 Consequences of Climate Change Damages for Economic Growth.pdf") type <- "working paper"
if (basename(x) == "Dellink 2010 Costs, Revenues, and Effectiveness of the Copenhagen Accord Emission Pledges for 2020.pdf") type <- "working paper"
if (basename(x) == "Hughes 2007 Forecasting Global Economic Growth with Endogenous Multifactor Productivity The International Futures (IFs) Approach.pdf") type <- "working paper"
if (basename(x) == "Hughes 2006 UNEP GEO4 Driver Scenarios Using IFS with Pardee.pdf") type <- "working paper"
if (basename(x) == "Hughes 2006 Assessing the Credibility of Forcasts using International Futures (IFs) Verification and Validation.pdf") type <- "working paper"
if (basename(x) == "Hughes 2005 Scenario Analysis with International Futures") type <- "working paper"
if (basename(x) == "Hughes 2004 The Base Case of International Futures (IFs) Comparison with Other Forecasts.pdf") type <- "working paper"
if (basename(x) == "Hughes 2004 Forecasting the Human Development Index.pdf") type <- "working paper"
if (basename(x) == "Hilderink et al. 2008 Towards a Global Integrated Sustainability Model GISMO1.0 status report.pdf") type <- "report, model documentation"
if (basename(x) == "Hedden et al. 2016 Ending hunger in Africa The elimination of hunger and food insecurity on the African by 2025 Conditions for success.pdf") type <- "report"
if (basename(x) == "Hedden et al. 2013 Fracking for Shale Gas in South Africa Blessing or Curse.pdf") type <- "working paper"
if (basename(x) == "Havlík et al. 2015 Global climate change, food supply and livestock production systems A bioeconomic analysis.pdf") type <- "book chapter"
if (basename(x) == "Havlik et al. 2015 Climate change impacts and mitigation in the developing world an integrated assessment of the agriculture and forestry sectors.pdf") type <- "working paper"
if (basename(x) == "Hughes 2008 Assessing Strategies for Reducing Global Poverty.pdf") type <- "working paper"
if (basename(x) == "Lanzi et al. 2013 Addressing Competitiveness and Carbon Leakage Impacts Arising from Multiple Carbon Markets.pdf") type <- "working paper"
if (basename(x) == "Labat et al. 2015 GECO2015 Global Energy and Climate Outlook Road to Paris Assessment of Low Emission Levels under World Action Integrating National Contributions.pdf") type <- "report"
if (basename(x) == "Kok et al. (2014) How sectors can contribute to sustainable use and conservation of biodiversity.pdf") type <- "report"
if (basename(x) == "Kiesewetter et al. 2016 A Scalable Approach to Modelling Health Impacts of Air Pollution Based on Globally Available Data.pdf") type <- "abstract"
if (basename(x) == "Irfan 2012 SADC Higher Education Futures 2050.pdf") type <- "report"
if (basename(x) == "International Council for Science (2017).pdf") type <- "report"
if (basename(x) == "International Commission on Financing Global Education Opportunity 2016 The Learning Generation Investing in Education for a Changing World.pdf") type <- "report"
if (basename(x) == "IDDRI (2014) DDPP_report.pdf") type <- "report"
if (basename(x) == "Hughes et al. 2011 Vulnerability to Interstate Conflict Evaluating Quantitative Measures.pdf") type <- "report"
if (basename(x) == "Hughes et al. 2011 Projections of Global Health Outcomes from 2005 to 2060 Using the International Futures Integrated Forecasting Tool.pdf") type <- "working paper"
if (basename(x) == "Hughes et al. 2011 Forecasting the Impacts of Environmental Constraints on Human Development.pdf") type <- "report"
if (basename(x) == "Hughes 2013 Development-Oriented Policies and Alternative Human Development Paths Aggressive but Reasonable Interventions.pdf") type <- "report"
if (basename(x) == "Hughes 2009 Economic Futures and Their Implications for Global Development The Unfolding of the Great Recession.pdf") type <- "working paper"
if (basename(x) == "OECD 2012 OECD Environmental Outlook to 2050 consequences of inaction.pdf") type <- "report"
if (basename(x) == "OECD 2016 Policy-Highlights-Economic-consequences-of-outdoor-air-pollution-web.pdf") type <- "report"
if (basename(x) == "OECD 2015 The Economic Consequences of Climate Change.pdf") type <- "report"
if (basename(x) == "OECD 2009 The Economics of Climate Change Mitigation Policies and Options for Global Action Beyond 2012_ES.pdf") type <- "report (executive summary only)"
if (basename(x) == "OECD 2008 OECD Environmental Outlook to 2030.pdf") type <- "report"
if (basename(x) == "Nelson et al. 2010 Food security, farming, and climate change to 2050, Scenarios, results, policy options.pdf") type <- "report"
if (basename(x) == "Narayan 2016 Envisioning a Healthy Future Africa's Shifting Burden of Disease") type <- "working paper"
if (basename(x) == "Moyer et al. 2015 Advancing development in Uganda evaluating policy choices for 2016-21 and selected impacts to 2040.pdf") type <- "working paper"
if (basename(x) == "Moyer 2012 Cultivating the Future Exploring the Potential and Impact of a Green Revolution in Africa.pdf") type <- "working paper"
if (basename(x) == "Messner 1995 User's guide for MESSAGE III.pdf") type <- "model documentation"
if (basename(x) == "Marczak 2016 Latin America and the Caribbean 2030 Future Scenarios.pdf") type <- "report"
if (basename(x) == "Shepherd et al. 2014 The Chronic Poverty Report 2014-2015 The Road to Zero Extreme Poverty.pdf") type <- "report"
if (basename(x) == "Shepherd et al. 2013 The Geography of Poverty, Disasters and Climate Extremes in 2030.pdf") type <- "report"
if (basename(x) == "Scott et al. 2017 Modeling Artificial Intelligence and Exploring its Impact.pdf") type <- "working paper"
if (basename(x) == "Riahi et al. 2012 GEA-Summary-web.pdf") type <- "report (executive summary only)"
if (basename(x) == "Riahi et al. 2012 GEA_Chapter17_pathways_lowres.pdf") type <- "book chapter"
if (basename(x) == "PSI 2012 (MergeDescription).pdf") type <- "model documentation"
if (basename(x) == "PSI 2014 (MergeCalibration).pdf") type <- "model documentation"
if (basename(x) == "pbl-2012-roads-from-rio-pathways-to-achieve-global-sustainability-goals-by-2050.pdf") type <- "report"
if (basename(x) == "PBL 2011 The protein puzzle the consumption and production of meat, dairy and fish in the European Union.pdf") type <- "report"
if (basename(x) == "PBL 2010 Rethinking Global Biodiversity Strategies Exploring structural changes in production and consumption to reduce biodiversity loss.pdf") type <- "report"
if (basename(x) == "PBL 2009 Beyond 2015 Long-term development and the Millennium Development Goals.pdf") type <- "report"
if (basename(x) == "PBL (2017) People_and_the_Earth_WEB.pdf") type <- "report"
if (basename(x) == "WMO_UNEP(2011).pdf") type <- "report"
if (basename(x) == "Van Vuuren 2007 Energy systems and climate policy - Long-term scenarios for an uncertain future.pdf") type <- "report"
# Comments
if (basename(x) == "Alkemade et al. 2011 Applying GLOBIO at different geographical levels.pdf") comment <- "uncomplete"
if (basename(x) == "Hayashi et al. 2014 Evaluation of Global Energy Crop Production Potential up to 2100 under Socioeconomic Development and Climate Change Scenarios.pdf") comment <- "japanese text"
if (basename(x) == "Kiesewetter et al. 2016 A Scalable Approach to Modelling Health Impacts of Air Pollution Based on Globally Available Data.pdf") comment <- "uncomplete, abstract only"
if (basename(x) == "Hughes et al. 2014 Opportunities and Challenges of a World with Negligible Senescence.pdf") comment <- "duplicated"
if (basename(x) == "Hughes et al. 2011 Projections of Global Health Outcomes from 2005 to 2060 Using the International Futures Integrated Forecasting Tool.pdf") comment <- "japanese, french, arabic, russian text"
if (basename(x) == "OECD 2009 The Economics of Climate Change Mitigation Policies and Options for Global Action Beyond 2012_ES.pdf") comment <- "uncomplete"
if (basename(x) == "Riahi et al. 2012 GEA-Summary-web.pdf") comment <- "uncomplete"
if (basename(x) == "Riahi et al. 2012 GEA_Chapter17_pathways_lowres.pdf") comment <- "uncomplete"
if (basename(x) == "Zhou et al. 2017 An Analysis on Hypothetical Shocks Representing Cooling Water Shortage Using a Computable General Equilibrium Model.pdf") comment <- "research gate paper"
if (basename(x) == "Yillia P (2015)") comment <- "german text"
data.frame(
author = au,
year = py,
type = type,
comment = comment,
filename = basename(x),
path = x
)
}) %>%
do.call("rbind", .)
v_pdf_summary_wYears <- inner_join(
v_pdf_years,
v_pdf_summary %>%
select(title,doi,so,pages,path),
by=c("path")
) %>%
select(author, year, type, title, doi, so, pages, comment, filename, path) %>%
mutate(year = as.numeric(paste(year)))
# NOTE: the hasXML flag is added further below, once v_data_xml_df has been built from the XML files
#v_data_xml <- read_xml(v_path_xmls[1])   # quick single-file check; superseded by the lapply() below
v_data_xml <- lapply(v_path_xmls,
function(x) {
print(x)
#---- local function --------
get_mcatt_sectionheader <- function(i_df, i_attr, i_levels) {
tmp <- i_df %>%
filter(grepl("^[1-9]{1}(\\.{1}[1-9]{1}){,2}\\s{1}(\\w){1,}", text) & (maxy-miny) <= 20) %>%
filter(row_number() <= 5) %>%
select_(i_attr) %>%
group_by_(i_attr) %>%
summarise(count=n()) %>%
ungroup() %>%
arrange(desc(count)) %>%
filter(row_number() <= i_levels)
return(tmp[[i_attr]])
}
get_mcatt_bodytext <- function(i_df, i_attr) {
tmp <- i_df %>%
arrange(desc(ncar)) %>%
filter(row_number() <= 5) %>%
select_(i_attr) %>%
group_by_(i_attr) %>%
summarise(count=n()) %>%
ungroup() %>%
arrange(desc(count)) %>%
filter(row_number() == 1)
return(tmp[[i_attr]])
}
xml <- x %>%
readLines() %>%
paste(collapse="") %>%
enc2utf8() %>%
iconv() %>%
read_xml()
tmp <- xml_find_all(xml, "//page/paragraph[@role='section-heading' or
@role='unknown' or
@role='body-text' or
@role='figure-caption' or
@role='figure' or
@role='table-caption' or
@role='title' or
@role='reference-heading' or
@role='formula'
]")
if (length(tmp) != 0) {
print(" > Processing file...")
page <- xml_attr(tmp, "page")
minx <- xml_attr(tmp, "minX")
miny <- xml_attr(tmp, "minY")
maxx <- xml_attr(tmp, "maxX")
maxy <- xml_attr(tmp, "maxY")
mcfo <- xml_attr(tmp, "mostCommonFont")
mcfs <- xml_attr(tmp, "mostCommonFontsize")
sfon <- xml_attr(tmp, "startFont")
sfos <- xml_attr(tmp, "startFontsize")
efon <- xml_attr(tmp, "endFont")
efos <- xml_attr(tmp, "endFontsize")
mcco <- xml_attr(tmp, "mostCommonColor")
scol <- xml_attr(tmp, "startColor")
ecol <- xml_attr(tmp, "endColor")
role <- xml_attr(tmp, "role")
text <- xml_text(tmp)
df <- data.frame(
id = 1:length(text),
doc = x,
page = as.numeric(page),
minx = as.numeric(minx),
miny = as.numeric(miny),
maxx = as.numeric(maxx),
maxy = as.numeric(maxy),
mcfo = mcfo,
mcfs = as.numeric(mcfs),
sfon = sfon,
sfos = as.numeric(sfos),
efon = efon,
efos = as.numeric(efos),
mcco = mcco,
scol = scol,
ecol = ecol,
role = role,
text = text,
ncar = nchar(text),
stringsAsFactors = FALSE
)
#-- Inferring text properties -------
# Infer most common font, font-size and colour used for section headers
print(" > Infer most common font for section headers")
section_header_mcfont <- get_mcatt_sectionheader(df, "mcfo", 2)
section_header_mcfontsize <- as.numeric(get_mcatt_sectionheader(df, "mcfs", 2))
section_header_mccolour <- get_mcatt_sectionheader(df, "mcco", 1)
# Infer most common font, font-size and colour used for body text
print(" > Infer most common font for body text")
body_text_mcfont <- get_mcatt_bodytext(df, "mcfo")
body_text_mcfontsize <- as.numeric(get_mcatt_bodytext(df, "mcfs"))
body_text_mccolour <- get_mcatt_bodytext(df, "mcco")
# Infer pdf content and structure (section headers and body text) by filtering out rows that do not match those common properties
print(" > Infer pdf structure and content")
df <- rbind(
# section-heading
df %>%
filter(grepl("^[1-9]{1}(\\.{1}[1-9]{1}){,2}\\s{1}(\\w){1,}", text) & (maxy-miny) <= 20) %>%
filter(mcfo %in% section_header_mcfont, mcfs %in% section_header_mcfontsize, mcco %in% section_header_mccolour) %>%
mutate(role="section-heading"),
# body-text
df %>%
filter(!grepl("^[1-9](\\.[1-9]){0,2}\\s\\w+", text) & (maxy-miny) > 20) %>%
filter(mcfo == body_text_mcfont, mcfs == body_text_mcfontsize, mcco == body_text_mccolour) %>%
mutate(role = "body-text")) %>%
arrange(id) %>%
# abstract
mutate(role=ifelse(grepl("^abstract.*", tolower(text)), "abstract", role))
#-- Sections allocation -------
# Allocate sections to rows #1
print(" > Allocate sections #1")
df <- df %>%
mutate(section = ifelse(role == "abstract", "abstract", "")) %>%
mutate(section = ifelse(role == "section-heading", text, ""))
# Allocate sections to rows #2
print(" > Allocate sections #2")
df$section[which(df$role == "body-text")] <- sapply((df %>%
select(id, role) %>%
filter(role == "body-text"))$id,
function(y) {
(df %>%
select(id, role, text) %>%
filter(role == "section-heading")
)$text[max(which((y - (df %>%
select(id, role) %>%
filter(role == "section-heading"))$id) > 0))]})
} else {
print("===========> Skipping <==================================================================")
df <- data.frame(
id = 1,
doc = x,
page = NA,
minx = NA,
miny = NA,
maxx = NA,
maxy = NA,
mcfo = "",
mcfs = NA,
sfon = "",
sfos = NA,
efon = "",
efos = NA,
mcco = "",
scol = "",
ecol = "",
role = "",
text = "",
ncar = 0,
section = "",
stringsAsFactors = FALSE
)
}
return(df)
})
v_data_xml_df <- v_data_xml %>%
do.call("rbind", .)
save(v_pdf_summary, v_data_xml, v_data_xml_df, file = "data/processedData_20171214.RData")
pdf_notextract <- v_pdf_summary$filename[which(!v_pdf_summary$filename %in% substr(unique(basename(v_data_xml_df$doc)), 1, nchar(unique(basename(v_data_xml_df$doc)))-4))]
cat("!! Warning !! Text in the following ", length(pdf_notextract)," pdfs could not be extracted at all:\n")
for (k in pdf_notextract) {
cat(paste0(" - ", k, "\n"))
}
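# Optional check (added for illustration): how many paragraphs of each role were extracted overall
print(table(v_data_xml_df$role))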
v_data_xml_df_1doc_per_doc <- v_data_xml_df %>%
select(doc, role, text) %>%
filter(role == "body-text") %>%
group_by(doc) %>%
summarise(text = paste(text, collapse="")) %>%
ungroup()
v_data_xml_df_1doc_per_doc <- v_data_xml_df_1doc_per_doc %>%
  mutate(text = tolower(text))
  #mutate(text = gsub("", "", text))        # pattern missing in this call; left disabled
  #mutate(text = iconv(text, to = "utf8"))
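# Optional check (added for illustration): how much body text was recovered per document
print(summary(nchar(v_data_xml_df_1doc_per_doc$text)))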
#---- Topic modelling --------------------------------------------------------------
ignorewords <- c("the", "with")  # extra stop words used further below by get_tokensIterator()
# make a corpus
#corpus <- corporate(v_data_xml_df_1doc_per_doc, col="text")
ignoreWords <- c("the", "however", "this", "and")  # extra stop words removed from the tm corpus below
corpus <- tm::Corpus(tm::VectorSource(v_data_xml_df_1doc_per_doc$text)) %>%
#tm::tm_map(tm::PlainTextDocument) %>%
tm::tm_map(tm::removePunctuation) %>%
tm::tm_map(tm::removeNumbers) %>%
tm::tm_map(tm::removeWords, tm::stopwords()) %>%
tm::tm_map(tm::removeWords, ignoreWords) %>%
tm::tm_map(tm::stemDocument)
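# Optional check (added for illustration): peek at the beginning of the first cleaned document;
# as.character() handles both SimpleCorpus and VCorpus elements.
cat(substr(paste(as.character(corpus[[1]]), collapse = " "), 1, 300), "\n")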
#t <- iconv(t,to="utf-8-mac")
# make a doc-term matrix and refresh the corpus to reflect any docs removed in the process
dtm <- makeDTM(corpus,0.95,v_data_xml_df_1doc_per_doc$doc,0.001249,0)
rem <- filter(v_data_xml_df_1doc_per_doc, doc %in% dtm$removed)
docs_used <- subset(v_data_xml_df_1doc_per_doc, !(doc %in% dtm$removed))
corpus_used <- refresh_corp(dtm$dtm)
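# Optional check (added for illustration, assuming the list structure returned by makeDTM() above):
# dimensions of the document-term matrix and number of documents dropped by the filtering.
print(dim(dtm$dtm))
cat("Documents removed by makeDTM():", length(dtm$removed), "\n")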
# save data
docs <- docs_used
corpus <- corpus_used
save(docs,corpus,dtm,file="data/docs_20171213.RData")
load("data/docs_20171213.RData")
rm(docs_used,corpus_used,rem)
SEED <- 2016
tm_sdg_20171213 <- list()
for (knt in c(25,27,30,32,35)) {
print(paste0("Running LDA with ", knt, " topics..."))
system.time({
tm_sdg_20171213[[paste(knt)]] = LDA(dtm$dtm,k=knt, method="VEM",
control=list(seed=SEED))
})
}
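# Optional check (added for illustration): top 5 terms and the first topic weights of the 25-topic model;
# any of the other fitted topic counts can be inspected the same way.
print(terms(tm_sdg_20171213[[paste(25)]], 5))
print(round(head(posterior(tm_sdg_20171213[[paste(25)]])$topics)[, 1:5], 3))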
# Leftover from an earlier interactive run (re-fits the 25-topic model into a separate list):
#system.time({
#  tm_sdg[[paste(25)]] <- LDA(dtm$dtm, k = 25, method = "VEM",
#                             control = list(seed = SEED))
#})
#library("xlsx")
#write.xlsx()  # incomplete call: no object or file specified
save(tm_sdg_20171213, file="data/lda_model_topics_25-35_20171213.RData")
file.copy("topic_summary_template.Rmd", "topic_summary.Rmd", overwrite = T)
rmarkdown::render("topic_summary.Rmd", params = list(
  tms = tm_sdg_20171213
))
# Leftover from an earlier 60-topic run (LDA_model is not defined in this script):
#terms(LDA_model, 10) %>% as.data.frame() %>% write.csv2(file = "output/LDAmodel_60topics_crosssection.csv")
# saves the results into the working dir
#save(LDA_model, file="data/lda_model_60topics.RData")
#terms(LDA_model, 3)
for (knt in c(25,27,30,32,35)) {
print(paste0("Exporting top 10 terms for TM with ", knt, " topics..."))
terms(tm_sdg_20171213[[paste(knt)]], 10) %>%
as.data.frame() %>%
write.csv2(file = paste0("output/LDAmodel_20171213_", knt,"topics_crosssection.csv"))
}
# visualise one of the fitted models with scimetrix/LDAvis (here the 25-topic model; adjust as needed)
visualise(tm_sdg_20171213[[paste(25)]], corpus, dtm$dtm, dir="output/LDA_model")
k_tw = 1
VERBOSE=TRUE
# window topic modeling
wtm <- list()
wtm[[k_tw]] <- list()
# Select documents
wtm[[k_tw]]$data <- v_data_xml_df_1doc_per_doc #%>% filter(PY %in% as.numeric(tw[[k_tw]]))
cat(paste0("Number of documents in current time-window: ", nrow(wtm[[k_tw]]$data), "\n"))
# Generator iterator
wtm[[k_tw]]$it <- get_tokensIterator(wtm[[k_tw]]$data, ignorewords, verbose=VERBOSE)
# Create vocabulary
wtm[[k_tw]]$vocab <- create_vocab(wtm[[k_tw]]$it, term_count_min = 5L, verbose=VERBOSE)
# Create Document-Term Matrix
wtm[[k_tw]]$dtm <- generate_dtm(wtm[[k_tw]]$vocab, wtm[[k_tw]]$it, wtm[[k_tw]]$data, verbose=VERBOSE)
# Perform NMF to get topics
tm_nmf <- nmf(as.matrix(wtm[[k_tw]]$dtm), rank = 50, seed="nndsvd")
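# Optional check (added for illustration; assumes the NMF package is installed and attached,
# see the commented-out library(NMF) above): top 10 terms of the first NMF topic,
# read off the coefficient (topic x term) matrix.
nmf_H <- coef(tm_nmf)
print(head(sort(nmf_H[1, ], decreasing = TRUE), 10))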
tm_lda <- LDA$new(n_topics = 60)
doc_topic_distr <- tm_lda$fit_transform(wtm[[k_tw]]$dtm, n_iter = 20)
tm_lda$plot()
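# Optional check (added for illustration; assumes the text2vec package providing LDA$new() above):
# top words per topic of the fitted text2vec model.
print(tm_lda$get_top_words(n = 10, lambda = 0.4))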
#==== PLOT DATA ====================================================================