Skip to content

Commit

Permalink
Fields read from files
Browse files Browse the repository at this point in the history
  • Loading branch information
alberto-martin committed Sep 11, 2016
1 parent a9f4d23 commit 287a2ad
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 16 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ splits a field with multiple elements (authors, subject categories, addresses, e

## Update log

v0.6 The list of fields is now read from the exported files themselves rather than from a predefined variable, since WoS may add new fields or discontinue old ones at any time.

v0.5 Added the split.c1 function to split the C1 WoS field (authors/addresses), as well as the split.field() and read.wos() wrappers.

v0.4 Added split.field function to split author, category etc. fields. It creates a new table containing each element in a separate row, keeping its relationship to the original record through the ID column.
Expand Down
38 changes: 24 additions & 14 deletions read.wos.functions.R
Original file line number Diff line number Diff line change
@@ -1,18 +1,17 @@
library(data.table)

fields <- c('PT', 'AU', 'BA', 'BE', 'GP', 'AF', 'BF', 'CA', 'TI', 'SO', 'SE', 'BS',
'LA', 'DT', 'CT', 'CY', 'CL', 'SP', 'HO', 'DE', 'ID', 'AB', 'C1', 'RP',
'EM', 'RI', 'OI', 'FU', 'FX', 'CR', 'NR', 'TC', 'Z9', 'PU', 'PI', 'PA',
'SN', 'EI', 'BN', 'J9', 'JI', 'PD', 'PY', 'VL', 'IS', 'PN', 'SU', 'SI',
'MA', 'BP', 'EP', 'AR', 'DI', 'D2', 'PG', 'WC', 'SC', 'GA', 'UT', 'AA',
'BB')

# This function will parse a list of WoS export files in Tab-delimited (Win, UTF-8) Format, and convert them
# to a data.table.

read.wos.tw8 <- function(path = './files', nrows=1000000L) {

# reads list of files
files <- list.files(path)
files <- list.files(path)

# Getting list of fields
fields <- readLines(files[1], n = 1)
fields <- substring(fields, 4)
fields <- strsplit(fields, "\t")[[1]]

# creates empty data.table
dt <- data.table(x=rep('0',nrows))
Expand Down Expand Up @@ -50,8 +49,6 @@ read.wos.tw8 <- function(path = './files', nrows=1000000L) {
}
# deletes unused rows and columns
dt <- dt[PT != '0']
dt <- dt[,AA:=NULL]
dt <- dt[,BB:=NULL]

# converts some variables to integer: NR (number of cited references),
# TC (times cited WoS),
Expand Down Expand Up @@ -80,6 +77,21 @@ clean_ut <- function(char_vec, split = " ") {strsplit(char_vec, split = split)[[
read.wos.plain <- function(path = './files', nrows=10000000L) {
files <- list.files(path)

# Getting list of fields
if (length(files) <= 10) {
lines <- unlist(sapply(files, readLines))
} else {
lines <- unlist(sapply(files[1:10], readLines))
}

#end_of_record <- which(grepl('ER', substr(lines,1,2)))[1]
lines <- lines[3:length(lines)]
fields <- unname(sapply(lines[startsWith(lines, " ") == F], substr, 1, 2))
fields <- unique(fields)
remove <- c("", "ER", "ï»", 'FN', 'VR')
fields <- fields[! fields %in% remove]
rm(lines, remove)

# creates empty data.table
dt <- data.table(x=rep('0',nrows))
l <- list(rep('0',length(fields)))
Expand Down Expand Up @@ -155,8 +167,6 @@ read.wos.plain <- function(path = './files', nrows=10000000L) {
}
# deletes unused rows
dt <- dt[PT != '0']
dt <- dt[,AA:=NULL]
dt <- dt[,BB:=NULL]

# cleans UT field
dt$UT <- sapply(dt$UT, clean_ut, split = ' ')
Expand Down Expand Up @@ -218,7 +228,7 @@ split.simple <- function(source_dt, idcol = 'UT', splitcol, delimiter = ';') {
}
}
# deletes unused rows
dt <- dt[UT != '0']
dt <- dt[dt[[1]] != '0']
dt
}

Expand Down Expand Up @@ -284,7 +294,7 @@ split.c1 <- function(source_dt, idcol = 'UT', splitcol = 'C1', delimiter = ';')
}
}
# deletes unused rows
dt <- dt[UT != '0']
dt <- dt[dt[[1]] != '0']
dt
}

Expand Down
4 changes: 2 additions & 2 deletions report.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,7 @@ split.simple <- function(source_dt, idcol = source_dt[['UT']], splitcol, delimit
}
}
# deletes unused rows
dt <- dt[UT != '0']
dt <- dt[dt[[1]] != '0']
dt
}
Expand Down Expand Up @@ -343,7 +343,7 @@ split.c1 <- function(source_dt, idcol = source_dt[['UT']], splitcol = 'C1', deli
}
}
# deletes unused rows
dt <- dt[UT != '0']
dt <- dt[dt[[1]] != '0']
dt
}
Expand Down

0 comments on commit 287a2ad

Please sign in to comment.