diff --git a/README.md b/README.md index 44ef647..433a822 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,8 @@ splits a field with multiple elements (authors, subject categories, addresses, e ## Update log +v0.6 The list of fields is now extracted from the exported files themselves, instead of from a predefined variable, since new fields may be added by WoS at any time, or old ones discontinued. + v0.5 Added split.c1 function to split the C1 WoS field (authors/addresses), split.field() and read.wos() wrappers v0.4 Added split.field function to split author, category etc. fields. It creates a new table containing each element in a separate row, keeping its relationship to the original record through the ID column. diff --git a/read.wos.functions.R b/read.wos.functions.R index 0b8d261..42a4acb 100644 --- a/read.wos.functions.R +++ b/read.wos.functions.R @@ -1,18 +1,17 @@ library(data.table) -fields <- c('PT', 'AU', 'BA', 'BE', 'GP', 'AF', 'BF', 'CA', 'TI', 'SO', 'SE', 'BS', - 'LA', 'DT', 'CT', 'CY', 'CL', 'SP', 'HO', 'DE', 'ID', 'AB', 'C1', 'RP', - 'EM', 'RI', 'OI', 'FU', 'FX', 'CR', 'NR', 'TC', 'Z9', 'PU', 'PI', 'PA', - 'SN', 'EI', 'BN', 'J9', 'JI', 'PD', 'PY', 'VL', 'IS', 'PN', 'SU', 'SI', - 'MA', 'BP', 'EP', 'AR', 'DI', 'D2', 'PG', 'WC', 'SC', 'GA', 'UT', 'AA', - 'BB') - # This function will parse a list of WoS export files in Tab-delimited (Win, UTF-8) Format, and convert them # to a data.table. read.wos.tw8 <- function(path = './files', nrows=1000000L) { + # reads list of files - files <- list.files(path) + files <- list.files(path) + + # Getting list of fields + fields <- readLines(files[1], n = 1) + fields <- substring(fields, 4) + fields <- strsplit(fields, "\t")[[1]] # creates empty data.table dt <- data.table(x=rep('0',nrows)) @@ -50,8 +49,6 @@ read.wos.tw8 <- function(path = './files', nrows=1000000L) { } # deletes unused rows and columns dt <- dt[PT != '0'] - dt <- dt[,AA:=NULL] - dt <- dt[,BB:=NULL] # converts some variables to integer: NR (number of cited references), # TC (times cited WoS), @@ -80,6 +77,21 @@ clean_ut <- function(char_vec, split = " ") {strsplit(char_vec, split = split)[[ read.wos.plain <- function(path = './files', nrows=10000000L) { files <- list.files(path) + # Getting list of fields + if (length(files) <= 10) { + lines <- unlist(sapply(files, readLines)) + } else { + lines <- unlist(sapply(files[1:10], readLines)) + } + + #end_of_record <- which(grepl('ER', substr(lines,1,2)))[1] + lines <- lines[3:length(lines)] + fields <- unname(sapply(lines[startsWith(lines, " ") == F], substr, 1, 2)) + fields <- unique(fields) + remove <- c("", "ER", "ï»", 'FN', 'VR') + fields <- fields[! fields %in% remove] + rm(lines, remove) + # creates empty data.table dt <- data.table(x=rep('0',nrows)) l <- list(rep('0',length(fields))) @@ -155,8 +167,6 @@ read.wos.plain <- function(path = './files', nrows=10000000L) { } # deletes unused rows dt <- dt[PT != '0'] - dt <- dt[,AA:=NULL] - dt <- dt[,BB:=NULL] # cleans UT field dt$UT <- sapply(dt$UT, clean_ut, split = ' ') @@ -218,7 +228,7 @@ split.simple <- function(source_dt, idcol = 'UT', splitcol, delimiter = ';') { } } # deletes unused rows - dt <- dt[UT != '0'] + dt <- dt[dt[[1]] != '0'] dt } @@ -284,7 +294,7 @@ split.c1 <- function(source_dt, idcol = 'UT', splitcol = 'C1', delimiter = ';') } } # deletes unused rows - dt <- dt[UT != '0'] + dt <- dt[dt[[1]] != '0'] dt } diff --git a/report.Rmd b/report.Rmd index 2defba0..e0537b5 100644 --- a/report.Rmd +++ b/report.Rmd @@ -279,7 +279,7 @@ split.simple <- function(source_dt, idcol = source_dt[['UT']], splitcol, delimit } } # deletes unused rows - dt <- dt[UT != '0'] + dt <- dt[dt[[1]] != '0'] dt } @@ -343,7 +343,7 @@ split.c1 <- function(source_dt, idcol = source_dt[['UT']], splitcol = 'C1', deli } } # deletes unused rows - dt <- dt[UT != '0'] + dt <- dt[dt[[1]] != '0'] dt }