-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathFRE6871_Lecture_7.Rnw
5962 lines (5505 loc) · 223 KB
/
FRE6871_Lecture_7.Rnw
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
% FRE6871_Lecture_7
% Define knitr options
% !Rnw weave=knitr
% Set global chunk options
<<knitr_setup,include=FALSE,cache=FALSE>>=
# Load knitr and set the default options for all the code chunks
library(knitr)
opts_chunk$set(
  prompt=TRUE, eval=FALSE, tidy=FALSE, strip.white=FALSE,
  comment=NA, highlight=FALSE, message=FALSE, warning=FALSE,
  size="tiny", fig.width=4, fig.height=4)
# Set the output width, the device, and the number of printed digits
options(width=80, dev="pdf", digits=3)
# Apply the "acid" syntax highlighting theme
thm <- knit_theme$get("acid")
knit_theme$set(thm)
@
% Define document options
\documentclass[9pt]{beamer}
\DeclareMathSizes{8pt}{6pt}{6pt}{5pt}
\mode<presentation>
\usetheme{AnnArbor}
% \usecolortheme{whale}
% Uncover everything in a step-wise fashion
% \beamerdefaultoverlayspecification{<+->}
% tikz package for plotting and tables
\usepackage{tikz}
\usetikzlibrary{positioning}
\usepackage{array}
\usepackage{multirow}
% mathtools package for math symbols
% \usepackage{mathtools}
\usepackage{bbm}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsfonts}
\usepackage[latin1]{inputenc}
\usepackage{hyperref}
\usepackage{fancybox}
\usepackage{url}
\usepackage[backend=bibtex,style=alphabetic]{biblatex} % bibstyle=numeric
\usepackage{animate}
\usepackage{listings}
\usepackage{xcolor}
\definecolor{anti_flashwhite}{rgb}{0.95, 0.95, 0.96}
\definecolor{cmd_background}{rgb}{0.2, 0.2, 0.0}
\definecolor{vba_background}{rgb}{0.0, 0.0, 0.9}
% \bibliographystyle{amsalpha} % doesn't work
% \addbibresource{FRE_lectures.bib}
% \addbibresource[location=remote]{http://www.citeulike.org/user/jerzyp}
\renewcommand\bibfont{\footnotesize}
\renewcommand{\pgfuseimage}[1]{\scalebox{0.75}{\includegraphics{#1}}} % scale bib icons
\setbeamertemplate{bibliography item}[text] % set bib icons
% \setbeamertemplate{bibliography item}{} % remove bib icons
% \usepackage{enumerate}
% \let\emph\textbf
% \let\alert\textbf
% Define colors for hyperlinks
\definecolor{links}{HTML}{2A1B81}
\hypersetup{colorlinks=true,linkcolor=,urlcolor=links}
% Make url text scriptsize
\renewcommand\UrlFont{\scriptsize}
% Make institute text italic and small
\setbeamerfont{institute}{size=\small,shape=\itshape}
\setbeamerfont{date}{size=\small}
\setbeamerfont{block title}{size=\normalsize} % shape=\itshape
\setbeamerfont{block body}{size=\footnotesize}
% Title page setup
\title[FRE6871 Lecture\#7]{FRE6871 \texttt{R} in Finance}
\subtitle{Lecture\#7, Fall 2024}
\institute[NYU Tandon]{NYU Tandon School of Engineering}
\titlegraphic{\includegraphics[scale=0.2]{image/tandon_long_color.png}}
\author[Jerzy Pawlowski]{Jerzy Pawlowski \emph{\href{mailto:jp3900@nyu.edu}{jp3900@nyu.edu}}}
% \email{jp3900@poly.edu}
\date{October 21, 2024}
% \date{\today}
% \pgfdeclareimage[height=0.5cm]{university-logo}{engineering_long_white}
% \logo{\pgfuseimage{engineering_long_white}}
%%%%%%%%%%%%%%%
\begin{document}
%%%%%%%%%%%%%%%
\maketitle
%%%%%%%%%%%%%%%
\section{Writing and Reading Data from Files}
%%%%%%%%%%%%%%%
\subsection{Writing Text Strings}
\begin{frame}[fragile,t]{\subsecname}
\vspace{-1em}
\begin{block}{}
\begin{columns}[T]
\column{0.5\textwidth}
The function \texttt{cat()} concatenates strings and writes them to standard output or to files.
\vskip1ex
\texttt{cat()} parses its argument character string and its escape sequences (\texttt{"\textbackslash{}"}), but doesn't return a value.
\vskip1ex
The function \texttt{print()} doesn't interpret its argument, and simply prints it to standard output and invisibly returns it.
\vskip1ex
Typing the name of an object in \texttt{R} implicitly calls \texttt{print()} on that object.
\vskip1ex
The function \texttt{save()} writes objects to compressed binary \texttt{.RData} files.
\column{0.5\textwidth}
\vspace{-1em}
<<echo=TRUE,eval=FALSE>>=
cat("Enter\ttab") # cat() parses backslash escape sequences
print("Enter\ttab") # print() displays the escape sequence as is
textv <- print("hello")
textv # print() returns its argument
# Create string
textv <- "Title: My Text\nSome numbers: 1,2,3,...\nRprofile files contain code executed at R startup,\n"
cat(textv, file="mytext.txt") # Write to text file
# Write several lines to text file
# cat() doesn't write the separator after the last string,
# so the last string ends with "\n" - otherwise readLines()
# warns about an incomplete final line
cat("Title: My Text",
  "Some numbers: 1,2,3,...",
  "Rprofile files contain code executed at R startup,\n",
  file="mytext.txt", sep="\n")
save(textv, file="mytext.RData") # Write to binary file
@
\end{columns}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%
\subsection{Displaying Numeric Data}
\begin{frame}[fragile,t]{\subsecname}
\vspace{-1em}
\begin{block}{}
\begin{columns}[T]
\column{0.5\textwidth}
The function \texttt{print()} displays numeric data objects, with the number of digits given by the global option \texttt{"digits"}.
\vskip1ex
The function \texttt{sprintf()} returns strings formatted from text strings and numeric data.
\column{0.5\textwidth}
\vspace{-1em}
<<echo=TRUE,eval=TRUE>>=
print(pi) # Number of digits is given by the option "digits"
print(pi, digits=10)
getOption("digits")
foo <- 12
bar <- "months"
# sprintf() substitutes the values into the format string:
# %i is replaced by an integer and %s by a string
sprintf("There are %i %s in the year", foo, bar)
@
\end{columns}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%
\subsection{Reading Text from Files}
\begin{frame}[fragile,t]{\subsecname}
\vspace{-1em}
\begin{block}{}
\begin{columns}[T]
\column{0.5\textwidth}
The function \texttt{scan()} reads text or data from a file and returns it as a vector or a list.
\vskip1ex
The function \texttt{readLines()} reads lines of text from a connection (file or console), and returns them as a vector of \texttt{character} strings.
\vskip1ex
The function \texttt{readline()} reads a single line from the console, and returns it as a \texttt{character} string.
\vskip1ex
The function \texttt{file.show()} reads text or data from a file and displays it in an editor.
\column{0.5\textwidth}
\vspace{-1em}
<<echo=TRUE,eval=FALSE>>=
# Read the file into a vector of strings, split at the newlines
scan("mytext.txt", what=character(), sep="\n")
# Read the lines of the file into a vector of strings
readLines("mytext.txt")
# Prompt the user and read a single line from the console
inputv <- readline(prompt="Enter a number: ")
class(inputv)
# Coerce the input string to numeric
inputv <- as.numeric(inputv)
# Read text from file and display in editor:
# file.show("mytext.txt")
# file.show("mytext.txt", pager="")
@
\end{columns}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%
\subsection{Writing and Reading \protect\emph{Data Frames} from \protect\emph{Text} Files}
\begin{frame}[fragile,t]{\subsecname}
\vspace{-1em}
\begin{block}{}
\begin{columns}[T]
\column{0.5\textwidth}
The functions \texttt{write.table()} and \texttt{read.table()} write and read \emph{data frames} from text files.
\vskip1ex
\texttt{write.table()} coerces objects to \emph{data frames} before it writes them.
\vskip1ex
\texttt{read.table()} returns a \emph{data frame}, without coercing non-numeric values to \texttt{factors} (so no need for the option \texttt{stringsAsFactors=FALSE}).
\vskip1ex
\texttt{write.table()} and \texttt{read.table()} can be used to write and read matrices from text files, but they have to be coerced back to matrices.
\vskip1ex
\texttt{write.table()} and \texttt{read.table()} are inefficient for very large data sets.
\column{0.5\textwidth}
\vspace{-1em}
<<echo=TRUE,eval=FALSE>>=
setwd("/Users/jerzy/Develop/lecture_slides/data")
# Create a data frame of flowers, with row names
dframe <- data.frame(
  type=c("rose", "daisy", "tulip"),
  color=c("red", "white", "yellow"),
  price=c(1.5, 0.5, 1.0),
  row.names=paste0("flower", 1:3)) # end data.frame
# Create a matrix of shuffled integers, with row and column names
matv <- matrix(sample(1:12), ncol=3,
  dimnames=list(NULL, paste0("col", 1:3)))
rownames(matv) <- paste0("row", seq_len(NROW(matv)))
# Round trip of the data frame through a text file
write.table(dframe, file="florist.txt")
readf <- read.table("florist.txt")
readf # A data frame
all.equal(readf, dframe)
# Round trip of the matrix through a text file
write.table(matv, file="matrix.txt")
readmat <- read.table("matrix.txt")
readmat # write.table() coerced matrix to data frame
class(readmat)
all.equal(readmat, matv)
# Coerce the data frame back to a matrix
readmat <- as.matrix(readmat)
class(readmat)
all.equal(readmat, matv)
@
\end{columns}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%
\subsection{Copying \protect\emph{Data Frames} Between the \protect\emph{clipboard} and \texttt{R}}
\begin{frame}[fragile,t]{\subsecname}
\vspace{-1em}
\begin{block}{}
\begin{columns}[T]
\column{0.5\textwidth}
\emph{Data frames} stored in the \emph{clipboard} can be copied into \texttt{R} using the function \texttt{read.table()}.
\vskip1ex
\emph{Data frames} in \texttt{R} can be copied into the \emph{clipboard} using the function \texttt{write.table()}.
\vskip1ex
This allows convenient copying of \emph{data frames} between \texttt{R} and \texttt{Excel}.
\vskip1ex
\emph{Data frames} can also be manipulated directly in the \texttt{R} spreadsheet-style data editor.
\vskip1ex
Copying and pasting between the \emph{clipboard} and \texttt{R} works well on Windows, but not on MacOS. There are some workarounds for MacOS:\\
\href{https://stackoverflow.com/questions/10959521/how-to-write-to-clipboard-on-ubuntu-linux-in-r}{\emph{Copy\_paste\_between\_R\_and\_clipboard}}
\column{0.5\textwidth}
\vspace{-1em}
<<echo=TRUE,eval=FALSE>>=
# Create a data frame with two rows and three columns
dframe <- data.frame(small=c(3, 5),
  medium=c(9, 11), large=c(15, 13))
# Edit the data frame in the spreadsheet-style data editor
dframe <- edit(dframe)
# Write the data frame to the clipboard, delimited by tabs
write.table(dframe, file="clipboard", sep="\t")
# Wrapper function for copying data frame from R into clipboard
# by default, data is tab delimited, with a header
# namev is passed to write.table() as its row.names argument,
# and the dots are passed to write.table() as well
# Returns the value of write.table()
write_clip <- function(data, namev=FALSE, col.names=TRUE, ...) {
  if (Sys.info()[["sysname"]] == "Darwin") {
    # MacOS has no "clipboard" file - pipe into pbcopy instead
    clipcon <- pipe("pbcopy", open="w")
    on.exit(close(clipcon)) # Close the pipe even after an error
  } else {
    clipcon <- "clipboard"
  } # end if
  write.table(x=data, file=clipcon, sep="\t",
    row.names=namev, col.names=col.names, ...)
} # end write_clip
write_clip(data=dframe)
# Wrapper function for copying data frame from clipboard into R
# by default, data is tab delimited, with a header
# All the arguments are passed to read.table()
# Returns the data frame produced by read.table()
read_clip <- function(file="clipboard", sep="\t", header=TRUE, ...) {
  if (identical(file, "clipboard") &&
      (Sys.info()[["sysname"]] == "Darwin")) {
    # MacOS has no "clipboard" file - read from pbpaste instead
    # read.table() opens the pipe and closes it when it's done
    file <- pipe("pbpaste")
  } # end if
  read.table(file=file, sep=sep, header=header, ...)
} # end read_clip
# Read the data frame directly from the clipboard
dframe <- read.table(file="clipboard", header=TRUE)
# Or read it using the wrapper function
dframe <- read_clip()
@
\end{columns}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%
\subsection{Writing and Reading \protect\emph{Data Frames} From \texttt{.csv} Files}
\begin{frame}[fragile,t]{\subsecname}
\vspace{-1em}
\begin{block}{}
\begin{columns}[T]
\column{0.5\textwidth}
The easiest way to share data between \texttt{R} and \texttt{Excel} is through \texttt{.csv} files.
\vskip1ex
The functions \texttt{write.csv()} and \texttt{read.csv()} write and read \emph{data frames} from \texttt{.csv} format files.
\vskip1ex
These functions are \emph{wrappers} for \texttt{write.table()} and \texttt{read.table()}.
\vskip1ex
\texttt{read.csv()} doesn't coerce non-numeric values to \texttt{factors}, so no need for the option \texttt{stringsAsFactors=FALSE}.
\vskip1ex
\texttt{read.csv()} reads row names as an extra column, unless the \texttt{row.names=1} argument is used.
\vskip1ex
The argument \texttt{"row.names"} accepts either the number or the name of the column containing the row names.
\vskip1ex
The \texttt{*.csv()} functions are very inefficient for large data sets.
\column{0.5\textwidth}
\vspace{-1em}
<<echo=TRUE,eval=FALSE>>=
# Round trip of the data frame through a CSV file
write.csv(dframe, file="florist.csv")
readf <- read.csv("florist.csv")
readf # The first column contains the row names
# Copy the first column into the row names
rownames(readf) <- readf[, 1]
readf <- readf[, -1] # Drop the first column
readf
all.equal(readf, dframe)
# Read the row names directly from the first column
readf <- read.csv("florist.csv", row.names=1)
readf
all.equal(readf, dframe)
@
\end{columns}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%
\subsection{Writing and Reading \protect\emph{Data Frames} From \texttt{.csv} Files (cont.)}
\begin{frame}[fragile,t]{\subsecname}
\vspace{-1em}
\begin{block}{}
\begin{columns}[T]
\column{0.5\textwidth}
The functions \texttt{write.csv()} and \texttt{read.csv()} can write and read \emph{data frames} from \texttt{.csv} format files \emph{without using row names}.
\vskip1ex
Row names can be omitted from the output file by calling \texttt{write.csv()} with the argument \texttt{row.names=FALSE}.
\column{0.5\textwidth}
\vspace{-1em}
<<echo=TRUE,eval=FALSE>>=
# Write the data frame to a CSV file, omitting the row names
write.csv(dframe, file="florist.csv", row.names=FALSE)
readf <- read.csv("florist.csv")
readf # A data frame without the original row names
all.equal(readf, dframe)
@
\end{columns}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%
\subsection{Reading Data From Very Large \texttt{.csv} Files}
\begin{frame}[fragile,t]{\subsecname}
\vspace{-1em}
\begin{block}{}
\begin{columns}[T]
\column{0.5\textwidth}
Data from very large \texttt{.csv} files can be read in small chunks instead of all at once.
\vskip1ex
The function \texttt{file()} opens a connection to a file or an internet website \texttt{URL}.
\vskip1ex
The function \texttt{read.csv()} with the argument \texttt{"nrows"} reads only the specified number of rows from a connection and returns a \emph{data frame}. The connection pointer then advances to the next unread row, so the next call continues reading from there.
\vskip1ex
The function \texttt{read.csv()} with the argument \texttt{"nrows"} allows reading data sequentially from very large files that wouldn't fit into memory.
\column{0.5\textwidth}
\vspace{-1em}
<<echo=TRUE,eval=FALSE>>=
# Open a read connection to a file
filecon <- file("/Users/jerzy/Develop/lecture_slides/data/etf_prices_crsp.csv", "r")
# Read the header and the first 10 rows
data10 <- read.csv(filecon, nrows=10)
# Read another 10 rows - the header was already read
data20 <- read.csv(filecon, nrows=10, header=FALSE)
colnames(data20) <- colnames(data10)
# Close the connection to the file
close(filecon)
# Open a read connection to a file
filecon <- file("/Users/jerzy/Develop/lecture_slides/data/etf_prices_crsp.csv", "r")
# Read the header and the first 1000 rows
data10 <- read.csv(filecon, nrows=1e3)
colv <- colnames(data10)
# Write to a file
countv <- 1
write.csv(data10, paste0("/Users/jerzy/Develop/data/temp/etf_prices_", countv, ".csv"))
# Read remaining rows in a loop 1000 rows at a time
repeat {
  # Pass header=FALSE because the header was already read
  # read.csv() produces an error if there are no more lines,
  # so return NULL at the end of file, and pass on other errors
  datav <- tryCatch(
    read.csv(filecon, nrows=1e3, header=FALSE),
    error=function(e) {
      if (grepl("no lines available", conditionMessage(e)))
        NULL
      else
        stop(e)
    }) # end tryCatch
  if (is.null(datav)) break
  colnames(datav) <- colv
  # Advance the counter first, so that no file is overwritten
  countv <- countv + 1
  write.csv(datav, paste0("/Users/jerzy/Develop/data/temp/etf_prices_", countv, ".csv"))
  # A partial chunk means that the end of file was reached
  if (NROW(datav) < 1e3) break
} # end repeat
# Close the connection to the file
close(filecon)
@
\end{columns}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%
\subsection{Writing and Reading Matrices From \texttt{.csv} Files}
\begin{frame}[fragile,t]{\subsecname}
\vspace{-1em}
\begin{block}{}
\begin{columns}[T]
\column{0.5\textwidth}
The functions \texttt{write.csv()} and \texttt{read.csv()} can write and read matrices from \texttt{.csv} format files.
\vskip1ex
If row names can be omitted in the output file, then \texttt{write.csv()} can be called with argument \texttt{row.names=FALSE}.
\vskip1ex
If the input file doesn't contain row names, then \texttt{read.csv()} can be called without the \texttt{"row.names"} argument.
\column{0.5\textwidth}
\vspace{-1em}
<<echo=TRUE,eval=FALSE>>=
# Round trip of the matrix through a CSV file
write.csv(matv, file="matrix.csv")
readmat <- read.csv("matrix.csv", row.names=1)
readmat # read.csv() returns a data frame
class(readmat)
readmat <- as.matrix(readmat) # Coerce to matrix
all.equal(readmat, matv)
# Write the matrix again, omitting the row names
write.csv(matv, file="matrix_ex_rows.csv", row.names=FALSE)
readmat <- as.matrix(read.csv("matrix_ex_rows.csv"))
readmat # A matrix without row names
all.equal(readmat, matv)
@
\end{columns}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%
\subsection{Writing and Reading Matrices (cont.)}
\begin{frame}[fragile,t]{\subsecname}
\vspace{-1em}
\begin{block}{}
\begin{columns}[T]
\column{0.5\textwidth}
There are several ways of writing and reading matrices from \texttt{.csv} files, with tradeoffs between simplicity, data size, and speed.
\vskip1ex
The function \texttt{write.matrix()} writes a matrix to a text file, without its row names.
\vskip1ex
\texttt{write.matrix()} is part of package \emph{MASS}.
\vskip1ex
The advantage of function \texttt{scan()} is its speed, but it doesn't handle row names easily.
\vskip1ex
Removing row names simplifies the writing and reading of matrices.
\vskip1ex
The function \texttt{readLines()} reads whole lines and returns them as single strings.
\column{0.5\textwidth}
\vspace{-1em}
<<echo=TRUE,eval=FALSE>>=
setwd("/Users/jerzy/Develop/lecture_slides/data")
library(MASS) # Load package "MASS"
# Write to CSV file by row - it's very SLOW!!!
MASS::write.matrix(matv, file="matrix.csv", sep=",")
# Skip the header line and read the numbers using scan()
readmat <- scan("matrix.csv", what=numeric(), sep=",", skip=1)
# Read the header line containing the column names
colv <- readLines("matrix.csv", n=1)
colv # A single string
# Split the string into a vector of column names
colv <- strsplit(colv, split=",")[[1]]
readmat # A vector, not a matrix!
# Fill the matrix by row, with one column per name
readmat <- matrix(readmat, ncol=length(colv), byrow=TRUE)
# Restore colnames
colnames(readmat) <- colv
readmat
# scan() is a little faster than read.csv()
library(microbenchmark)
summary(microbenchmark(
  read_csv=read.csv("matrix.csv"),
  scan=scan("matrix.csv", what=numeric(), sep=",", skip=1),
  times=10))[, c(1, 4, 5)] # end microbenchmark summary
@
\end{columns}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%
\subsection{Reading Matrices Containing Bad Data}
\begin{frame}[fragile,t]{\subsecname}
\vspace{-1em}
\begin{block}{}
\begin{columns}[T]
\column{0.5\textwidth}
Very often data that is read from external sources contains elements with bad data.
\vskip1ex
An example of bad data is \texttt{character} strings within sets of \texttt{numeric} data.
\vskip1ex
Columns of numeric data that contain strings are coerced to \texttt{character} or \texttt{factor}, when they're read by \texttt{read.csv()}.
\vskip1ex
The function \texttt{as.numeric()} coerces complex data objects into \texttt{numeric} vectors, and removes all their \emph{attributes}.
\vskip1ex
\texttt{as.numeric()} coerces strings that don't represent numbers into \texttt{NA} values.
\column{0.5\textwidth}
\vspace{-1em}
<<echo=TRUE,eval=FALSE>>=
# Read the data frame, with row names from the first column
matv <- read.csv("data/matrix_bad.csv", row.names=1)
matv
class(matv)
# Columns containing strings are read as character or factor
sapply(matv, class)
# Coerce a single column to numeric
matv$col2 <- as.numeric(matv$col2)
# Or coerce all the columns at once:
# Save the row names because sapply() doesn't keep them
namev <- rownames(matv)
# Coerce every column to numeric - returns a matrix
matv <- sapply(matv, as.numeric)
# Put back the row names
rownames(matv) <- namev
# Set the NA values to zero
matv[is.na(matv)] <- 0
# The matrix without NAs
matv
@
\end{columns}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%
\subsection{Writing and Reading Time Series From \protect\emph{Text} Files}
\begin{frame}[fragile,t]{\subsecname}
\vspace{-1em}
\begin{block}{}
\begin{columns}[T]
\column{0.5\textwidth}
The package \emph{zoo} contains functions \texttt{write.zoo()} and \texttt{read.zoo()} for writing and reading \emph{zoo} time series from \texttt{.txt} and \texttt{.csv} files.
\vskip1ex
The functions \texttt{write.zoo()} and \texttt{read.zoo()} are \emph{wrappers} for \texttt{write.table()} and \texttt{read.table()}.
\vskip1ex
The function \texttt{write.zoo()} writes the \emph{zoo} series index as a character string in quotations \texttt{""}, to make it easier to read (parse) by \texttt{read.zoo()}.
\vskip1ex
Users may also directly use \texttt{write.table()} and \texttt{read.table()}, instead of \texttt{write.zoo()} and \texttt{read.zoo()}.
\column{0.5\textwidth}
\vspace{-1em}
<<echo=(-(1:4)),eval=FALSE>>=
setwd("/Users/jerzy/Develop/lecture_slides/data")
rm(list=ls())
# Initialize the random number generator
set.seed(1121, "Mersenne-Twister", sample.kind="Rejection")
library(zoo) # Load package zoo
# Create zoo series of 100 random numbers with a daily Date index
datev <- seq(from=as.Date("2013-06-15"), by="day",
length.out=100)
pricev <- zoo(rnorm(NROW(datev)), order.by=datev)
head(pricev, 3)
# Write zoo series to text file, and then read it back
write.zoo(pricev, file="pricev.txt")
pricezoo <- read.zoo("pricev.txt") # Read it back
all.equal(pricezoo, pricev)
# Perform the same using write.table() and read.table()
# First coerce pricev into data frame
dframe <- as.data.frame(pricev)
# Add the dates as the first column
dframe <- cbind(datev, dframe)
# Write the data frame to text file using write.table(),
# without the row names and without the column names
write.table(dframe, file="pricev.txt",
row.names=FALSE, col.names=FALSE)
# Read data frame from file
pricezoo <- read.table(file="pricev.txt")
sapply(pricezoo, class) # Classes of the data frame columns
# Coerce data frame into zoo series:
# the first column is coerced to the Date index,
# and the remaining column to a vector of data
pricezoo <- zoo::zoo(
drop(as.matrix(pricezoo[, -1])),
order.by=as.Date(pricezoo[, 1]))
all.equal(pricezoo, pricev)
@
\end{columns}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%
\subsection{Writing and Reading Time Series From \texttt{.csv} Files}
\begin{frame}[fragile,t]{\subsecname}
\vspace{-1em}
\begin{block}{}
\begin{columns}[T]
\column{0.5\textwidth}
By default the functions \texttt{zoo::write.zoo()} and \texttt{zoo::read.zoo()} write data in \emph{space}-delimited text format, but they can also write to \emph{comma}-delimited \texttt{.csv} files by passing the parameter \texttt{sep=","}.
\vskip1ex
Single column \emph{zoo} time series usually don't have a dimension attribute, and they don't have a column name, unlike multi-column \emph{zoo} time series, and this can cause hard to detect bugs.
\vskip1ex
It's best to always pass the argument \texttt{"col.names=TRUE"} to the function \texttt{write.zoo()}, to make sure it writes a column name for a single column \emph{zoo} time series.
\vskip1ex
Reading a \texttt{.csv} file containing a single column of data using the function \texttt{read.zoo()} produces a \emph{zoo} time series with a \texttt{NULL} dimension, unless the argument \texttt{"drop=FALSE"} is passed to \texttt{read.zoo()}.
\vskip1ex
Users may also directly use \texttt{write.table()} and \texttt{read.table()}, instead of \texttt{write.zoo()} and \texttt{read.zoo()}.
\column{0.5\textwidth}
\vspace{-1em}
<<echo=(-(1:1)),eval=FALSE>>=
library(zoo) # Load package zoo
# Write zoo series to CSV file, and then read it back
# Pass sep="," to write comma-delimited data, and
# col.names=TRUE to write the column names
write.zoo(pricev, file="zooseries.csv", sep=",", col.names=TRUE)
# Pass header=TRUE to read the column names, and
# drop=FALSE to keep the dimension of a single column
pricezoo <- read.zoo(file="zooseries.csv",
header=TRUE, sep=",", drop=FALSE)
# Drop the dimension of pricezoo before comparing it with pricev
all.equal(pricev, drop(pricezoo))
@
\end{columns}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%
\subsection{Writing and Reading Time Series With \protect\emph{Date-time} Index}
\begin{frame}[fragile,t]{\subsecname}
\vspace{-1em}
\begin{block}{}
\begin{columns}[T]
\column{0.5\textwidth}
The function \texttt{write.zoo()} writes \emph{zoo} time series into \texttt{.csv} files, but it doesn't format the time at midnight properly.
\vskip1ex
The function \texttt{write.table()} writes \emph{zoo} time series into \texttt{.csv} files, and it formats the time at midnight properly.
\vskip1ex
If the index of a \emph{zoo} time series is a \emph{date-time}, then \texttt{write.zoo()} writes the date and time fields as character strings separated by a \emph{space} between them, inside quotations \texttt{""}.
\vskip1ex
The functions \texttt{read.csv.zoo()} and \texttt{read.zoo()} read \emph{zoo} time series from \texttt{.csv} files.
\vskip1ex
Very often \texttt{.csv} files contain custom \emph{date-time} formats, which need to be passed as parameters into \texttt{read.zoo()} for proper formatting.
\vskip1ex
The \texttt{"FUN"} argument of \texttt{read.zoo()} accepts a function for coercing the date and time columns of the input data into a \emph{date-time} object suitable for the \emph{zoo} index.
\vskip1ex
The function \texttt{as.POSIXct()} coerces \texttt{character} strings into \texttt{POSIXct} \emph{date-time} objects.
\column{0.5\textwidth}
\vspace{-1em}
<<echo=(-(1:1)),eval=FALSE>>=
# Initialize the random number generator
set.seed(1121, "Mersenne-Twister", sample.kind="Rejection")
# Create zoo series of 100 random numbers with hourly POSIXct index
datev <- seq(from=as.POSIXct("2014-07-14"),
by="hour", length.out=100)
zooseries <- zoo(rnorm(NROW(datev)), order.by=datev)
head(zooseries, 3)
# Write zoo series to CSV file using write.zoo()
write.zoo(zooseries, file="zooseries.csv", sep=",", col.names=TRUE)
# Read from CSV file using read.csv.zoo() - doesn't work
# NOTE(review): the file was written with col.names=TRUE,
# but it's read with header=FALSE - confirm if that's intended
zooread <- read.csv.zoo(file="zooseries.csv", header=FALSE,
format="%Y-%m-%d %H:%M:%S", tz="America/New_York")
# Read from CSV file using read.zoo() - error
zooread <- read.zoo(file="zooseries.csv", header=FALSE,
sep=",", FUN=as.POSIXct, format="%Y-%m-%d %H:%M:%S")
# Write zoo series to CSV file using write.table(),
# with the index as row names and without column names
write.table(zooseries, file="zooseries.csv", sep=",",
row.names=TRUE, col.names=FALSE)
# Read from CSV file using read.zoo() with format argument
# FUN=as.POSIXct coerces the first column into the index
zooread <- read.zoo(file="zooseries.csv", header=FALSE,
sep=",", FUN=as.POSIXct, format="%Y-%m-%d %H:%M:%S")
all.equal(zooseries, zooread) # Works
# Coerce zoo series into data frame with custom date format:
# the row names are the index formatted as month-day-year
dframe <- as.data.frame(zooseries)
rownames(dframe) <- format(index(zooseries), format="%m-%d-%Y %H:%M:%S")
# Write the data frame to csv file using write.table()
write.table(dframe, file="zooseries.csv", sep=",",
row.names=TRUE, col.names=FALSE)
# Read from CSV file using read.zoo() with the custom format
zooread <- read.zoo(file="zooseries.csv", header=FALSE, sep=",",
FUN=as.POSIXct, format="%m-%d-%Y %H:%M:%S")
all.equal(zooseries, zooread) # Works
# Or using read.csv.zoo()
zooread <- read.csv.zoo(file="zooseries.csv", header=FALSE,
format="%m-%d-%Y %H:%M:%S", tz="America/New_York")
head(zooread, 3)
# Compare only the values, ignoring the attributes
all.equal(zooseries, zooread, check.attributes=FALSE) # Works
@
\end{columns}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%
\subsection{Reading Time Series With \texttt{Numeric} \protect\emph{Date-time} Index}
\begin{frame}[fragile,t]{\subsecname}
\vspace{-1em}
\begin{block}{}
\begin{columns}[T]
\column{0.5\textwidth}
If the index of a time series is \texttt{numeric} (representing the \emph{moment of time}, either as the number of days or seconds), then it must be coerced to a proper \emph{date-time} class.
\vskip1ex
A convenient way of reading time series with a numeric index is by using \texttt{read.table()}, and then coercing the \emph{data frame} into a time series.
\vskip1ex
The function \texttt{as.POSIXct.numeric()} coerces a \texttt{numeric} value representing the \emph{moment of time} into a \texttt{POSIXct} \emph{date-time}, equal to the \emph{clock time} in the local \emph{time zone}.
\column{0.5\textwidth}
\vspace{-1em}
<<echo=TRUE,eval=FALSE>>=
# Read time series from CSV file, with numeric date-time
datazoo <- read.table(file="/Users/jerzy/Develop/lecture_slides/data/es_ohlc.csv",
header=TRUE, sep=",")
# A data frame
class(datazoo)
# Classes of the data frame columns
sapply(datazoo, class)
# Coerce data frame into xts series:
# the first column is coerced from numeric into the POSIXct index
# (relative to the origin 1970-01-01, in New York time zone),
# and the remaining columns are coerced into a matrix of data
datazoo <- xts::xts(as.matrix(datazoo[, -1]),
order.by=as.POSIXct.numeric(datazoo[, 1], tz="America/New_York",
origin="1970-01-01"))
# An xts series
class(datazoo)
head(datazoo, 3)
@
\end{columns}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%
\subsection{Passing Arguments to the \texttt{save()} Function}
\begin{frame}[fragile,t]{\subsecname}
\vspace{-1em}
\begin{block}{}
\begin{columns}[T]
\column{0.5\textwidth}
The function \texttt{save()} writes objects to a binary file.
\vskip1ex
Object names can be passed into \texttt{save()} either through the \texttt{"..."} argument, or the \texttt{"list"} argument.
\vskip1ex
Objects passed through the \texttt{"..."} argument are not evaluated, so they must be either object names or character strings.
\vskip1ex
Object names aren't surrounded by quotes \texttt{""}, while character strings that represent object names are surrounded by quotes \texttt{""}.
\vskip1ex
Objects passed through the \texttt{"list"} argument are evaluated, so they may be variables containing character strings.
\column{0.5\textwidth}
\vspace{-1em}
<<echo=(-1)>>=
rm(list=ls()) # Delete all objects in workspace
var1 <- 1
var2 <- 2
ls() # Names of all the objects
ls()[1] # Name of the first object
args(save) # Arguments of the save() function
# Pass the object name as a string into the "..." argument
save("var1", file="my_data.RData")
# Pass the object name itself into the "..." argument
save(var1, file="my_data.RData")
# Pass several object names into the "..." argument
save(var1, var2, file="my_data.RData")
# Pass an expression into the "..." argument
# ls()[1] is not evaluated
save(ls()[1], file="my_data.RData")
# Pass the name of the first object into the "list" argument
save(list=ls()[1], file="my_data.RData")
# Pass the names of all the objects into the "list" argument
save(list=ls(), file="my_data.RData")
@
\end{columns}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%
\subsection{Writing and Reading Lists of Objects}
\begin{frame}[fragile,t]{\subsecname}
\vspace{-1em}
\begin{block}{}
\begin{columns}[T]
\column{0.5\textwidth}
The function \texttt{load()} reads data from \texttt{.RData} files, and \emph{invisibly} returns a vector of names of objects created in the workspace.
\vskip1ex
The vector of names can be used to manipulate the objects in loops, or to pass them to functions.
\column{0.5\textwidth}
\vspace{-1em}
<<echo=TRUE,eval=FALSE>>=
rm(list=ls())  # Start from an empty workspace
# Read the objects from the binary file, and
# keep the names of the objects that were created
loadobj <- load(file="my_data.RData")
loadobj  # Names of the loaded objects
ls()  # Contents of the workspace
# Overwrite the loaded objects in the global environment
# with random numbers, using a functional
sapply(loadobj, function(namev)
  assign(namev, runif(1), envir=globalenv()))
ls()  # Contents of the workspace
# Overwrite the loaded objects again, using a for() loop
for (symboln in loadobj)
  assign(symboln, runif(1))
ls()  # Contents of the workspace
# Write the loaded objects back to the binary file
save(list=loadobj, file="my_data.RData")
# Delete the loaded objects, but nothing else
rm(list=loadobj)
# Delete the vector of names "loadobj"
rm(loadobj)
@
\end{columns}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%
\subsection{Saving Output of \texttt{R} to a File}
\begin{frame}[fragile,t]{\subsecname}
\vspace{-1em}
\begin{block}{}
\begin{columns}[T]
\column{0.5\textwidth}
The function \texttt{sink()} diverts \texttt{R} \emph{text} output (excluding graphics) to a file, or ends the diversion.
\vskip1ex
Remember to call \texttt{sink()} to end the diversion!
\vskip1ex
The function \texttt{pdf()} diverts graphics output to a \emph{pdf} file (text output isn't diverted), in vector graphics format.
\vskip1ex
The functions \texttt{png()}, \texttt{jpeg()}, \texttt{bmp()}, and \texttt{tiff()} divert graphics output to graphics files (text output isn't diverted).
\vskip1ex
The function \texttt{dev.off()} ends the diversion.
\column{0.5\textwidth}
\vspace{-1em}
<<echo=TRUE,eval=FALSE>>=
# Redirect text output to file
sink("sinkdata.txt")
cat("Redirect text output from R\n")
print(runif(10))
cat("\nEnd data\nbye\n")
sink()  # Turn redirect off
# Redirect graphics to pdf file
pdf("Rgraph.pdf", width=7, height=4)
# Text output isn't diverted by the graphics device
cat("Redirect data from R into pdf file\n")
# Spell out the argument name length.out in full,
# instead of relying on partial matching of "len"
myvar <- seq(-2*pi, 2*pi, length.out=100)
plot(x=myvar, y=sin(myvar), main="Sine wave",
  xlab="", ylab="", type="l", lwd=2, col="red")
cat("\nEnd data\nbye\n")
dev.off()  # Turn pdf output off
# Redirect graphics output to png file
png("r_plot.png")
cat("Redirect graphics from R into png file\n")
plot(x=myvar, y=sin(myvar), main="Sine wave",
  xlab="", ylab="", type="l", lwd=2, col="red")
cat("\nEnd data\nbye\n")
dev.off()  # Turn png output off
@
\end{columns}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%
\section{High Performance Data Management}
%%%%%%%%%%%%%%%
\subsection{Package \protect\emph{data.table} for High Performance Data Management}
\begin{frame}[fragile,t]{\subsecname}
\vspace{-1em}
\begin{block}{}
\begin{columns}[T]
\column{0.5\textwidth}
The package
\href{https://cran.r-project.org/web/packages/data.table/}{\emph{data.table}}
is designed for high performance data management.
\vskip1ex
The package \emph{data.table} implements \emph{data table} objects, which are a special type of \emph{data frame}, and an extension of the \emph{data frame} class.
\vskip1ex
\emph{Data tables} are faster and more convenient to work with than \emph{data frames}.
\vskip1ex
\emph{data.table} functions are optimized for high performance (speed), because they are written in \texttt{C} and they perform operations by reference (in place), without copying data in memory.
\vskip1ex
Some of the attractive features of package \emph{data.table} are:
\begin{itemize}
\item Syntax is analogous to SQL,
\item Very fast writing and reading from files,
\item Very fast sorting and merging operations,
\item Subsetting using multiple logical clauses,
\item Columns of type \texttt{character} are never converted to factors,
\end{itemize}
\column{0.5\textwidth}
\vspace{-1em}
<<echo=TRUE,eval=FALSE>>=
# Download and install package data.table
install.packages(pkgs="data.table")
# Attach package data.table to the search path
library(data.table)
# Documentation for package data.table
# Short description of the package
packageDescription(pkg="data.table")
# Help page of the package
help(package="data.table")
# Datasets contained in "data.table"
data(package="data.table")
# Objects contained in "data.table"
ls(name="package:data.table")
# Detach data.table from the search path
detach(name="package:data.table")
@
The package \emph{data.table} has extensive documentation:\\
\hskip1em\url{https://cran.r-project.org/web/packages/data.table/vignettes/datatable-intro.html}\\
\hskip1em\url{https://github.com/Rdatatable/data.table/wiki}
\vskip1ex
\end{columns}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%
\subsection{\protect\emph{Data Table} Objects}
\begin{frame}[fragile,t]{\subsecname}
\vspace{-1em}
\begin{block}{}
\begin{columns}[T]
\column{0.5\textwidth}
\emph{Data table} objects are a special type of \emph{data frame}, and are derived from the class \texttt{data.frame}.
\vskip1ex
\emph{Data table} objects resemble databases, with columns of different types of data, and rows of records containing individual observations.
\vskip1ex
The function \texttt{data.table::data.table()} creates a \emph{data table} object.
\vskip1ex
\emph{Data table} columns can be referenced directly by their names (without quotes), and their rows can be referenced without a following comma.
\vskip1ex
When a \emph{data table} is printed (by typing its name) and it has more rows than \texttt{getOption("datatable.print.nrows")} (\texttt{100} by default), then only the top \texttt{5} and bottom \texttt{5} rows are displayed.
\vskip1ex
The special symbol \texttt{.N} returns the number of observations (rows) in the \emph{data table}.
\vskip1ex
\emph{Data table} computations are usually much faster than equivalent \texttt{R} computations, but not always.
\column{0.5\textwidth}
\vspace{-1em}
<<echo=TRUE,eval=FALSE>>=
# Attach package data.table and create a data table
# with three columns of randomly permuted integers
library(data.table)
dtable <- data.table::data.table(
  col1=sample(7),
  col2=sample(7),
  col3=sample(7))
# Show the class of dtable and print it
class(dtable)
dtable
# Extract a column by its name, without quotes
dtable[, col2]
# Extract a row, without a following comma
dtable[2]
# Display and change the option "datatable.print.nrows"
getOption("datatable.print.nrows")
options(datatable.print.nrows=10)
getOption("datatable.print.nrows")
# Three ways of getting the number of rows in dtable
# Using base R
NROW(dtable)
# Using base R inside the data table brackets
dtable[, NROW(col1)]
# Using the data.table symbol .N
dtable[, .N]
# Compare the speed of data.table syntax with base R
library(microbenchmark)
summary(microbenchmark(
  dt=dtable[, .N],
  rcode=NROW(dtable),
  times=10))[, c(1, 4, 5)]  # end microbenchmark summary
@
\end{columns}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%
\subsection{Writing and Reading Data Using Package \protect\emph{data.table}}
\begin{frame}[fragile,t]{\subsecname}
\vspace{-1em}