Fields read from files

alberto-martin · Sep 11, 2016 · 287a2ad · 287a2ad
1 parent a9f4d23
commit 287a2ad
Show file tree

Hide file tree

Showing 3 changed files with 28 additions and 16 deletions.
diff --git a/README.md b/README.md
@@ -24,6 +24,8 @@ splits a field with multiple elements (authors, subject categories, addresses, e
 
 ## Update log
 
+v0.6  The list of fields is now extracted from the exported files themselves instead of from a predefined variable, since WoS may add new fields or discontinue old ones at any time.
+
 v0.5	Added split.c1 function to split the C1 WoS field (authors/addresses), split.field() and read.wos() wrappers
 
 v0.4    Added split.field function to split author, category etc. fields. It creates a new table containing each element in a separate row, keeping its relationship to the original record through the ID column.

diff --git a/read.wos.functions.R b/read.wos.functions.R
@@ -1,18 +1,17 @@
 library(data.table)
 
-fields <- c('PT', 'AU', 'BA', 'BE', 'GP', 'AF', 'BF', 'CA', 'TI', 'SO', 'SE', 'BS', 
-            'LA', 'DT', 'CT', 'CY', 'CL', 'SP', 'HO', 'DE', 'ID', 'AB', 'C1', 'RP', 
-            'EM', 'RI', 'OI', 'FU', 'FX', 'CR', 'NR', 'TC', 'Z9', 'PU', 'PI', 'PA', 
-            'SN', 'EI', 'BN', 'J9', 'JI', 'PD', 'PY', 'VL', 'IS', 'PN', 'SU', 'SI', 
-            'MA', 'BP', 'EP', 'AR', 'DI', 'D2', 'PG', 'WC', 'SC', 'GA', 'UT', 'AA', 
-            'BB')
-
 # This function will parse a list of WoS export files in Tab-delimited (Win, UTF-8) Format, and convert them
 # to a data.table.
 
 read.wos.tw8 <- function(path = './files', nrows=1000000L) {
+
   # reads list of files
-  files  <- list.files(path)  
+  files  <- list.files(path)
+
+  # Getting list of fields
+  fields <- readLines(files[1], n = 1)
+  fields <- substring(fields, 4)
+  fields <- strsplit(fields, "\t")[[1]]
 
   # creates empty data.table
   dt <- data.table(x=rep('0',nrows))
@@ -50,8 +49,6 @@ read.wos.tw8 <- function(path = './files', nrows=1000000L) {
   }
   # deletes unused rows and columns
   dt <- dt[PT != '0']
-  dt <- dt[,AA:=NULL]
-  dt <- dt[,BB:=NULL]
 
   # converts some variables to integer: NR (number of cited references),
   #                                     TC (times cited WoS),
@@ -80,6 +77,21 @@ clean_ut <- function(char_vec, split = " ") {strsplit(char_vec, split = split)[[
 read.wos.plain <- function(path = './files', nrows=10000000L) {
   files  <- list.files(path)
 
+  # Getting list of fields
+  if (length(files) <= 10) {
+    lines <- unlist(sapply(files, readLines))
+  } else {
+    lines <- unlist(sapply(files[1:10], readLines))
+  }
+
+  #end_of_record <- which(grepl('ER', substr(lines,1,2)))[1]
+  lines <- lines[3:length(lines)]
+  fields <- unname(sapply(lines[startsWith(lines, "  ") == F], substr, 1, 2))
+  fields <- unique(fields)
+  remove <- c("", "ER", "ï»", 'FN', 'VR')
+  fields <- fields[! fields %in% remove]
+  rm(lines, remove)
+
   # creates empty data.table
   dt <- data.table(x=rep('0',nrows))
   l  <- list(rep('0',length(fields)))
@@ -155,8 +167,6 @@ read.wos.plain <- function(path = './files', nrows=10000000L) {
   }
   # deletes unused rows
   dt <- dt[PT != '0']
-  dt <- dt[,AA:=NULL]
-  dt <- dt[,BB:=NULL]
 
   # cleans UT field
   dt$UT  <- sapply(dt$UT, clean_ut, split = ' ')
@@ -218,7 +228,7 @@ split.simple <- function(source_dt, idcol = 'UT', splitcol, delimiter = ';') {
     }
   }  
   # deletes unused rows
-  dt <- dt[UT != '0']
+  dt <- dt[dt[[1]] != '0']
   dt
 }
 
@@ -284,7 +294,7 @@ split.c1 <- function(source_dt, idcol = 'UT', splitcol = 'C1', delimiter = ';')
     }
   }  
   # deletes unused rows
-  dt <- dt[UT != '0']
+  dt <- dt[dt[[1]] != '0']
   dt
 }
 

diff --git a/report.Rmd b/report.Rmd
@@ -279,7 +279,7 @@ split.simple <- function(source_dt, idcol = source_dt[['UT']], splitcol, delimit
     }
   }  
   # deletes unused rows
-  dt <- dt[UT != '0']
+  dt <- dt[dt[[1]] != '0']
   dt
 }
 
@@ -343,7 +343,7 @@ split.c1 <- function(source_dt, idcol = source_dt[['UT']], splitcol = 'C1', deli
     }
   }  
   # deletes unused rows
-  dt <- dt[UT != '0']
+  dt <- dt[dt[[1]] != '0']
   dt
 }