-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcompute_ngrams_for_WOS_data.R
83 lines (70 loc) · 3.82 KB
/
compute_ngrams_for_WOS_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# Code for obtaining ngrams of size 4, 3, and 2 from Web of Science data.
# Tokyo Institute of Technology, Graduate School of Environment and Society, Kajikawa Laboratory
# cristianmejia00@gmail.com
# 20200418 Distribution version.
# Input:
# A dataset file from the Web of Science. With extensions .txt .tsv or .csv
# Output:
# 3 .csv files that can be opened in Excel, one file for each 4, 3, and 2 keywords-long ngrams.
# Notes:
# This code was developed for files downloaded from the Web of Science.
# If multiple files were downloaded, please merge them first using other codes or Excel.
# If you plan to use this code with other files, like patents from Derwent Innovation, you need to change the column names (TI and AB) used in the "Prepare the text" section below.
# This script will show several "warning" messages. These are not errors and can be safely ignored.
# This code was tested on Windows 10. It has not been tested on macOS or Linux.
#######################################################################
# Call libraries
#######################################################################
# Note: the first time you run this in your PC it might take a while.
# Install any required package that is not available yet, then attach them all.
# requireNamespace() is used for the check because R's documentation advises
# against calling installed.packages() only to find out whether a single
# package is installed (it reads files for every installed package and is slow).
for (package_name in c("plyr", "dplyr", "data.table", "tm", "ngram")) {
  if (!requireNamespace(package_name, quietly = TRUE)) {
    install.packages(package_name)
  }
}
# plyr is attached before dplyr, as in the original script.
library(plyr)
library(dplyr)
library(data.table)
library(tm)
library(ngram)
#######################################################################
# Read the data
#######################################################################
# Pick the Web of Science file (.txt) on your local machine through the
# file-selection dialog, then load it as a table.
input_file <- file.choose()
dataset <- fread(input_file, stringsAsFactors = FALSE)
# Or use my sample (Remove the "#" symbol in the next line)
#dataset <- fread('https://raw.githubusercontent.com/cristianmejia00/kajikawa_lab/master/test_data_WOS.csv', stringsAsFactors = FALSE, fill = TRUE)
#######################################################################
# Prepare the text
#######################################################################
# In this sequence:
# -- Unify Title and Abstract
# ---- Convert text to "tm" object
# ------ To lowercase
# ------ Remove stopwords
# ------ Remove numbers and symbols
# ------ Remove extra whitespaces
# ---- Convert "tm" object to text
# Build one text per record from the title (TI) and abstract (AB) columns.
# Stop early with a clear message if the expected columns are absent (for
# example, when the file does not come from the Web of Science).
text_columns <- c("TI", "AB")
missing_columns <- setdiff(text_columns, names(dataset))
if (length(missing_columns) > 0) {
  stop(
    "Column(s) not found in the dataset: ",
    paste(missing_columns, collapse = ", "),
    call. = FALSE
  )
}
# paste() turns a missing value into the literal string "NA", which would then
# be counted as the word "na" in the ngrams; use an empty string instead.
titles <- as.character(dataset$TI)
titles[is.na(titles)] <- ""
abstracts <- as.character(dataset$AB)
abstracts[is.na(abstracts)] <- ""
documents <- paste(titles, abstracts, sep = ". ")
# Clean the text with "tm": lowercase, drop English stopwords, numbers and
# punctuation, then collapse repeated whitespace.
text <- Corpus(VectorSource(documents)) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace)
# Convert the "tm" corpus back to a plain character vector, one element per
# record. seq_along() is safe for an empty corpus, unlike 1:length().
text <- unlist(lapply(seq_along(text), function(i) text[[i]]$content))
#######################################################################
# Obtain ngrams and write reports
#######################################################################
# All cleaned documents are joined into one single string, so the ngrams are
# counted over the whole dataset at once.
bulk_text <- paste(text, collapse = " ")
# Helper: phrase table of the `size`-word ngrams found in `corpus_text`,
# keeping only the rows whose frequency is at least `min_count`.
frequent_phrases <- function(corpus_text, size, min_count = 3) {
  phrases <- get.phrasetable(ngram(corpus_text, n = size))
  phrases[phrases$freq >= min_count, ]
}
# ngrams of 4, 3, and 2 words that appear 3 times or more in the dataset.
ngram4 <- frequent_phrases(bulk_text, size = 4)
ngram3 <- frequent_phrases(bulk_text, size = 3)
ngram2 <- frequent_phrases(bulk_text, size = 2)
# Save one report per ngram size in the current working directory.
write.csv(ngram4, file = "ngram4.csv", row.names = FALSE)
write.csv(ngram3, file = "ngram3.csv", row.names = FALSE)
write.csv(ngram2, file = "ngram2.csv", row.names = FALSE)
# Print the folder where the files were written.
getwd()