---
title: \pagenumbering{gobble} <!-- ends page numbering "`r Sys.Date()`" -->\vspace{3.5in}Product Recommender Engine
subtitle: "Use Case: 'The MovieLens 10M dataset'"
date: "2019-02-27"
author: "Aurélien-Morgan"
output:
pdf_document:
toc: false
toc_depth: 2
number_sections: true
keep_tex: true
fig_width: 7
fig_height: 6
fig_caption: true
fig_crop: false
df_print: kable
highlight: tango
pandoc_args: [
"--natbib"
]
graphics: yes
fontsize: 10pt
linkcolor: blue
urlcolor: cyan
geometry: margin=1in
documentclass: article
classoption: twoside
header-includes:
- \hypersetup{
pdfauthor={Aurélien-Morgan},
pdftitle={},
pdfsubject={},
pdfkeywords={},
pdfproducer={MiKTeX pdfTeX with hyperref},
pdfcreator={PdfLaTeX},
bookmarksnumbered=true,
bookmarksopen=true,
bookmarksopenlevel=2,
pdfstartview=FitH,
pdfpagelayout=OneColumn,
unicode=true,
bookmarks=true,
pdfpagemode=UseOutlines,
pdfinfo={
CreationDate={D:20181229195600}
},
citecolor=magenta
}
# for 'vector' annotation in LaTeX ; @see 'https://tex.stackexchange.com/questions/163279/how-can-i-type-formula-cosine-of-two-vectors-nice'
- \usepackage{esvect}
- \usepackage[table]{xcolor}
- \usepackage{wrapfig}
# fix the "! Undefined control sequence. l.115 \toprule" issue with the r 'kableExtra ' package ; @see 'https://github.com/rstudio/rmarkdown/issues/1384#issuecomment-400106730'
- \usepackage{booktabs}
# allow for portrait/landscape pages switch ; @see 'https://stackoverflow.com/questions/25849814/rstudio-rmarkdown-both-portrait-and-landscape-layout-in-a-single-pdf#27334272'
- \usepackage{lscape}
- \usepackage{pdflscape}
- <!-- remove margin for figures and tables captions ; @see 'https://tex.stackexchange.com/questions/94016/how-to-reduce-space-between-image-and-its-caption#94018' -->
- \usepackage[font=small,skip=0pt]{caption}
- <!-- fix the "'fig' r tags fixed at top of pages" issue ; @see 'https://stackoverflow.com/questions/29696172/how-to-hold-figure-position-with-figure-caption-in-pdf-output-of-knitr#51608212' -->
- \usepackage{float}
- \floatplacement{figure}{H}
- \setlength{\columnsep}{18pt}
- \usepackage{multicol}
- <!-- fix to the multicol/pandoc compatibility issue ; @see 'https://stackoverflow.com/questions/40982836/latex-multicolumn-block-in-pandoc-markdown#41005796' -->
- \newcommand{\hideFromPandoc}[1]{#1}
- \hideFromPandoc{
\let\Begin\begin
\let\End\end
\let\Vspace\vspace
}
- \usepackage{fancyhdr}
- <!-- declare custom 'firststyle' (footer) -->
- \fancypagestyle{firststyle} {
\fancyhf{}
\newcommand {\changefont} {\fontsize{6}{8}\selectfont}
\pagestyle{fancy}
\renewcommand{\headrulewidth}{0pt}
\fancyfoot[LE, LO]{\begin{minipage}[c]{3cm}\end{minipage}}
\fancyfoot[CE, CO]{\begin{minipage}[c]{.6\textwidth}\changefont Harvard Executive Education – Data Science Professional Certification Program\\\begin{center}– Capstone Project 1 / 2 - mandatory topic -\end{center}\end{minipage}}
\IfFileExists{./harvardx_logo_100.jpg}{
\fancyfoot[RE, RO]{\begin{minipage}[c]{3cm}\includegraphics[]{./harvardx_logo_100.jpg}\end{minipage}}
}{
`r if( !file.exists( "./harvardx_logo_100.jpg" ) ) try( download.file(url = "https://www.edx.org/sites/default/files/upload/harvardx_logo_100.jpg", destfile = "harvardx_logo_100.jpg", mode = 'wb') )`
\fancyfoot[RE, RO]{\begin{minipage}[c]{3cm}\end{minipage}}
}
}
- \fancypagestyle{plain}{\pagestyle{firststyle}} <!-- apply custom 'firststyle' (footer) to cover page -->
- \setcounter{section}{-1} <!-- start numbering sections at '0' -->
- \renewcommand{\contentsname}{}\vspace{-0.5cm}
- \renewcommand{\listfigurename}{}\vspace{-0.5cm}
- \renewcommand{\listtablename}{}\vspace{-0.5cm}
- \renewcommand\refname{}\vspace{-0.5cm}
- \RequirePackage{filecontents}
biblio-style: unsrt <!-- to allow for 'numbered' citations/references -->
bibliography: Reportbib
params:
dummy: <!-- centralizes inputs -->
overallAccuracy: "98%"
rmse: "0.881 stars"
a_vector: !r c( "toto", "titi" )
---
<!--
amc_pdf_print()
-->
<!--
REQUIRES minimal MiKTeX installation
with 'upquote', 'natbib', 'filecontents', 'fancyhdr', 'multicol', 'float', 'caption', 'lscape', 'pdflscape', 'booktabs', 'wrapfig', 'xcolor' and 'esvect'
packages installed
-->
<!--
amc_pdf_print( paramsList = list(
overallAccuracy = "97.9%"
, rmse = "0.879 stars"
, a_vector = c( "tata", "tutu" ) ) )
-->
<style>
.main-container {
max-width: 120px !important;
}
</style>
<!--
tidy.opts=list(width.cutoff=60, tidy=TRUE)
-->
```{r global_options, R.options=knitr::opts_chunk$set( warning=FALSE, message=FALSE, echo = TRUE, dev = 'pdf', cache = FALSE, results=TRUE, include=TRUE, eval=TRUE ) }
```
```{r echo = FALSE, results = 'hide', include = FALSE, message = FALSE, warning = FALSE }
if( !require( tidyverse ) ) { install.packages( "tidyverse" ) } ; suppressWarnings(suppressMessages(suppressPackageStartupMessages(library( tidyverse, quietly = TRUE ))))
if( !require( data.table ) ) { install.packages( "data.table" ) } ; suppressWarnings(suppressMessages(suppressPackageStartupMessages(library( data.table, quietly = TRUE ))))
if( !require( matrixStats ) ) { install.packages( "matrixStats" ) } ; suppressWarnings(suppressMessages(suppressPackageStartupMessages(library( matrixStats, quietly = TRUE ))))
if( !require( gridExtra ) ) { install.packages( "gridExtra" ) } ; suppressWarnings(suppressMessages(suppressPackageStartupMessages(library( gridExtra, quietly = TRUE )))) # to organize plots
if( !require( knitr ) ) { install.packages( "knitr" ) } ; suppressWarnings(suppressMessages(suppressPackageStartupMessages(library( knitr, quietly = TRUE ))))
options( kableExtra.latex.load_packages = FALSE ) # !! VERY IMPORTANT !!
if( !require( kableExtra ) ) { install.packages( "kableExtra" ) } # DO NOT LOAD !
```
```{r echo = FALSE }
# ensure there's a 'trained_model_instance' object for the running session :
if( !exists( "trained_model_instance" ) ) {
if( exists( "dev_trained_model_instance" ) ) {
# lazy evaluation ; renames the object without copying it
# @see 'https://stackoverflow.com/questions/22951811/how-to-rename-a-variable-in-r-without-copying-the-object'
trained_model_instance <- dev_trained_model_instance
suppressWarnings( rm( dev_trained_model_instance ) ) # warning raised due to 'scope' difference at report compile time.
if( exists( "dev_predictions" ) ) {
predictions <- dev_predictions
suppressWarnings( rm( dev_predictions ) )
} else {
predictions <- NULL
}
} else {
trained_model_instance <- NULL
predictions <- NULL
}
} else {
if( !exists( "predictions" ) ) {
predictions <- NULL
}
}
```
```{r echo = FALSE }
windowsFonts(
"Helvetica" = windowsFont( "Helvetica" ) )
theme_amc <- function( base_size_ = 9 ) {
# my custom ggplot2 theme
theme_bw( base_size = base_size_
, base_family = "Helvetica" ) %+replace%
theme(
panel.grid.minor = element_blank(), panel.border = element_blank()
, axis.line = element_line( colour = "black" )
)
}
theme_set( theme_amc() ) # set 'theme_amc' as the 'session' theme
#theme_set( theme_gray() ) # default ggplot2 theme
```
```{r echo = FALSE }
empty_message <-
paste( " Please have a trained model in memory\n",
" prior to compiling this report\n",
" (either 'trained_model_instance'\n",
" or 'dev_trained_model_instance')\n",
" and call the 'amc_pdf_print' function" )
empty_plot <- ggplot() +
annotate( "text", x = 4, y = 25
, size = 6.5, fontface = 'italic'
, label =
paste( " Empty figure\n"
, empty_message )
, color = "blue" ) +
theme( axis.line = element_blank()
, axis.text.x = element_blank()
, axis.text.y = element_blank()
, axis.ticks = element_blank()
, axis.title.x = element_blank()
, axis.title.y = element_blank()
, legend.position = "none"
, panel.background = element_blank()
, panel.border = element_blank()
, panel.grid.major = element_blank()
, panel.grid.minor = element_blank()
, plot.background = element_blank() )
empty_table <- data.frame(
empty_message =
unlist( str_split(
paste( " Empty table\n"
, empty_message )
, "\n" ) ) )
rm( empty_message )
```
\newpage
# Table of Content {.unnumbered}
\tableofcontents
\vspace{1.5cm}
## List of Figures {.unnumbered}
\listoffigures
\vspace{1.5cm}
## List of Tables {.unnumbered}
\listoftables
\newpage
\pagenumbering{arabic} <!-- starts page numbering -->
<!--
amc_pdf_print()
-->
# Preamble
\Begin{multicols}{3}
Widespread across the internet, Recommender Agents have the ability to effectively learn users’ interests and help these potential consumers find their way to an ideal product among a vast offering. Service providers, often marketplaces, rely on them heavily.
Their main intended goal is to convert a “browser” into a “buyer” via assisted navigation, ease of use and an enhanced user experience, all of which leads to higher user retention. Recommender systems also allow detecting opportunities for cross-selling and/or suggesting package deals, increasing the value of the average shopping basket.
\columnbreak
Recommender Systems are used by platforms, brands and advertisement agencies to improve overall user satisfaction, by making it more likely for each user to quickly find what he/she is interested in. They are already applied to a variety of industries, ranging from retail and accommodation booking to video streaming, news feeds and even social networks [@socialNetworks]. Recommendation systems have been around for more than 25 years [@firstCollabFiltering]. Ever since, there have been many developments in the field, as for instance contemplated in this contemporary paper: [@twoDecades]. As the **Microsoft Research Lab – Asia** puts it, many efforts are of course continuously being made worldwide to develop them even further [@microsoftResearch], and those are watched closely by marketing and sales professionals eager to stay ahead of the pack. Recommender Engines have indeed proven to be very effective Marketing / Targeting tools.
\columnbreak
There are inherent complexities induced by the usage of Recommender Systems. The exciting topic of their ubiquity has for instance been covered by Ms. Lusi Li in her PhD thesis [@marketplaceEconomics], in the context of a dominant e-commerce platform that sells competing products from different manufacturers while simultaneously recommending a subset of these products. Ms. Li therein explores intra-product-category competition as a function of product complementarity, as well as the potential strategic price responses from manufacturers. Her observations are very interesting (you should check them out!) and can obviously be transposed to any industry.
\End{multicols}
---
\newpage
<!--
amc_pdf_print()
-->
# Executive Summary{#executiveSummaryAnchor}
\Begin{multicols}{3}
From 2006 to 2009, Netflix^TM^ sponsored a competition, offering a grand prize of $1,000,000 to the team that could take an offered dataset of over 100 million movie ratings and return recommendations that were 10% more accurate than those offered by the company's existing recommender system. That system relied on data consisting of a set of users having allotted a rating of up to 5 stars to each movie they had respectively watched. This competition energized the search for new and more accurate algorithms.
The Netflix^TM^ challenge winners were evaluated based on the ***R***oot ***M***ean ***S***quare ***E***rror of their model (***RMSE***)[^id_690].
For a set of “N” ratings, if we define “$y_{u,i}$” as the rating for movie “i” by user “u” and denote our prediction with “$\hat{y}_{u,i}$”, then it can be written as follows[^id_331]:
$$RMSE = \sqrt{ \frac{1}{N} \sum_{u,i} (y_{u,i} - \hat{y}_{u,i})^2 }$$
\columnbreak
On 21 September 2009, the victory went to the “_**BellKor's Pragmatic Chaos**_” team, which achieved a test RMSE of 0.8567 stars, accounting for a 10.06% improvement over the original system.
A good summary of how the winning algorithm was put together can be read here: [@netflixWinSummary], and a more detailed explanation here: [@netflixBellKorenSolution].
This will be the focus of the present short paper: recommendation systems which, in order to make specific recommendations to users, use the ratings that users have given items. Since the original data is not publicly available [@netflixPrivacyLawsuit], we are here going to work on data provided by the online movie recommender service "MovieLens" ('https://movielens.org/'). The entire latest 'MovieLens' dataset can be found online [@movieLensEntireDataset]. To make the computation a little easier, we will use the "10M" version of the 'MovieLens' dataset [@movieLens10MDataset], which consists of *“only”* 10 million ratings applied to 10,000 movies by 70,000 users, and which was released in January 2009[^id_546].
\columnbreak
The brief introduction to Recommender Engines in this report will adopt characteristics of the model developed by the Netflix^TM^ challenge winning team, such as their original data analysis strategies. We’ll also use the RMSE as our loss function[^id_321].
The Netflix^TM^ challenge winners implemented two general classes of models. One was similar to "k-nearest neighbors", where they found "movies" that were similar to each other and "users" that were similar to each other. The other one was based on an approach called "matrix factorization". We'll cover a little of all that in the present document.
\End{multicols}
---
\clearpage
[^id_690]: the square root of the mean squared error of the ***L***east ***S***quares ***E***stimates (LSE)
[^id_331]: variables that are estimates are marked with a hat "$\hat{}$".
[^id_546]: [@movieLens10MDatasetPaper]
[^id_321]: meaning, we’ll also develop a model so that it minimizes the RMSE.
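For reference, the RMSE loss above boils down to a one-liner in R. A minimal sketch, assuming two plain numeric vectors (the names `true_ratings` and `predicted_ratings` are placeholders, not objects from the model code):
```{r echo = TRUE, eval = FALSE }
# RMSE : square root of the average squared prediction error
RMSE <- function( true_ratings, predicted_ratings ) {
  sqrt( mean( ( true_ratings - predicted_ratings )^2 ) )
}
RMSE( c( 5, 3, 4 ), c( 4.5, 3.5, 4 ) ) # 0.4082483
```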
<!--
amc_pdf_print()
-->
# A dive into the 'hows'
We’ll start by developing a **linear model** encompassing elements accounting for two different effects. These simple contributions to each individual user/movie rating consist in the fact that:
* on average, to a measurable extent, each given user rates above (or below) the average of users; a.k.a. the "**user effect**"
* on average, to a measurable extent, each given movie is rated above (or below) the average of movies; a.k.a. the "**movie effect**"
In that model, we’ll employ **regularization** techniques (think "Bayesian adjustments") by optimizing a select group of parameters of our model against the "Penalized Least Squares". In layman's terms, we'll penalize (correct) overly high ratings for movies that have thus far collected so few ratings that these cannot be considered representative (and we'll do the same for users who have provided few ratings to date).
$~$<!-- blank line (equation with single equation white space) -->
We’ll then **model the residuals** of that linear model through a movies **k nearest neighbors** (knn) approach. There are two important aspects to this statement:
* _Nearest neighbors_ models work based on _distances/similarities_[^id_008]. In our case here, for each user, we'll assign ratings to movies based on ratings of _similarly rated_ movies. We'll measure "similarity" depending on how movies "compare" in terms of ratings distribution. For instance, to simplify, if movie "A" has been rated _3 stars_ by users "1", "2", "3" and "4", then the knn algorithm will predict that same rating for user "5", provided all 5 users have rated movie "B" the same (say, _4 stars_).
* _Residuals_ are the _errors_ of the linear model, i.e. the differences between the predictions of that model and the actual ratings. If a user rates a movie _3.5 stars_ and the linear model has predicted _3 stars_, then _0.5 stars_ is the _residual_ that will be passed on for the knn model to predict.
Combined, these two models will constitute our first **stacked ensemble model**, sketched right below. What we're doing here is similar in spirit to _boosting_, as it indeed consists in having a subsequent model focus its prediction on the error (shortcomings) of a prior model. But similarities end there[^id_725]^,^ [^id_827].
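As a minimal sketch of that stacking principle (all object names here are hypothetical placeholders, not the actual model objects):
```{r echo = TRUE, eval = FALSE }
# hypothetical stacking-on-residuals illustration :
linear_residuals <- train_ratings - linear_predictions # errors of the first model
# ...a second model (here, knn) is trained to predict those residuals,
# and the stacked prediction is the sum of both contributions :
stacked_predictions <- linear_predictions + knn_residual_predictions
```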
$~$<!-- blank line (equation with single equation white space) -->
We’ll also apply a **matrix factorization** technique to the residuals of the initial regularized linear model. Combined, those two will constitute our second stacked ensemble model.
Matrix factorization here is an orthogonal transformation of the matrix of ratings (with movies as columns and users as rows). We'll use _Principal Component Analysis (PCA)_ to decompose that matrix in order to achieve **dimensionality reduction**. PCA allows us to summarize the information relative to correlated users (think 'clusters') with a limited number of uncorrelated factors (called "Principal Components"). As an example, consider 2 highly correlated users: their rating profiles carry largely redundant information, which a single factor can capture.
Those 'uncorrelated factors', those "Principal Components", can for instance be thought of as characteristics of clusters of movies, such as:
* whether or not a user appreciates movies directed by Luc BESSON[@lucBesson].
* whether or not a user appreciates movies starring Jean DUJARDIN[@jeanDujardin].
* whether or not a user appreciates movies involving a mix of romance and action.
* etc.
Matrix factorization algorithms permit us to detect patterns between clusters of users and clusters of movies, explaining the variation in the true ratings without losing too much information (only 'noise').
[^id_725]: most boosting algorithms consist of iteratively learning weak supervised models and adding them to finally constitute a strong supervised model. When they are added, they are typically weighted in some way that is usually related to the weak learners' accuracy [@boostWiki].
[^id_827]: model stacking introduced by Kaggle Competitions Grandmaster Marios MICHAILIDIS (KazAnova) : [@stackIntro][@stackIntroYoutube]
[^id_008]: the similarity measure adopted by the Netflix^TM^ challenge winning team was the cosine similarity. For details, see the [knn training section](#knnAnchor).
\clearpage
<!--
amc_pdf_print()
-->
At this stage, we'll end up with the two stacked model instances below:
* Regularized linear + knn
* Regularized linear + pca
They will each provide predictions with a certain level of accuracy. To achieve an even better performance globally, we'll apply a **bagging ensemble** to these two, as sketched below. Bagging[^id_620] involves averaging predictions in order to counterbalance individual models' weaknesses, achieve better predictive power and converge towards the true ratings.
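A minimal sketch of that averaging (the two prediction vector names are placeholders for the outputs of the stacked models above):
```{r echo = TRUE, eval = FALSE }
# bagging : simple average of the two stacked models' predictions
bagged_predictions <- ( knn_stack_predictions + pca_stack_predictions ) / 2
```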
$~$<!-- blank line (equation with single equation white space) -->
To train all these models, we’ll employ k-fold cross-validation[@crossValidationExplained] as a way to keep us safe from over-fitting[@overfitting] (low bias, high variance). Overfitting is the tendency a model can have to reproduce its training data too well, following even the variation induced by randomness in that data, and thus having no predictive power on new observations.
$~$<!-- blank line (equation with single equation white space) -->
[^id_620]: historically, "bagging" came as an abbreviation of Bootstrap AGGregatING
---
\clearpage
<!--
amc_pdf_print()
-->
# For the little geek in you {#technicalAnchor}
Let's get a little technical. It's going to be brief and informative, I promise. I'll try to always remain engaging, but if you have no interest in unveiling how to execute, even partly, the prediction procedure depicted here, you can simply skip over this section. There's much fun in reading the rest anyway!
The intent was to provide people with procedures and results that could be fully reproduced by anyone at home, without any monetary investment or reliance on cloud computing, as long as they own a relatively modern PC.
The _R_ and _Python_ languages have in common that the objects they use are held in memory. They both struggle at dealing natively with large ones. For that reason, the _movielens_ dataset that we employ here, with its 10 million records, is large enough that particular measures have to be taken for it to be manageable on a single laptop.
At our disposal: a machine hosting a Windows 10 Operating System with an "8th Gen. intel^®^ Core^TM^ i7 processor"[^id_726] (6 cores; 12 logical processors), 256 GB of SSD storage on the same disk as the one hosting the O.S., and 16 GB of memory[^id_729]. We won't speak of other characteristics such as the GPU, as it is not exploited in the use case that concerns us here.
At its peak, the algorithm put together to generate this report requires a little over 60 gigabytes of memory. To circumvent that constraint, we need to extend the memory capacity available to R, which is feasible by first increasing the O.S. paging size as explained here: [@windowsVirtualMemory] & [@windowsPartition].
If you can, set the maximum memory allocation for R at up to 128 GB. In addition to the above, it must also be declared at the 'R session' level with the line of code below:
```{r echo = TRUE, eval = FALSE }
memory.limit( size = 128000 )
```
On such a large dataset, in order to keep training duration at a manageable level on a laptop like the one depicted above, one also has to draw on multi-threading/parallel processing whenever possible. For matrix arithmetic, a first great level of gain can be achieved by switching from _standard R_ to _Microsoft R Open_ and _intel MKL_[@microsoftRopen][@multithreadingWindows][@whyRslow][@rstudioRversion][@32_64bitsRstudio].
It will however not save the day in all circumstances. When looking for performance optimization in R, the options generally are 'vectorization' versus 'apply functions' and/or 'for loops' [@vectorization]. _Parallel 'foreach' looping_[^id_312] has for instance been investigated in the context of knn training, which was by far the most computationally demanding part of the entire algorithm. However beneficial when working with distributed systems, such parallel processing requires objects to be duplicated in memory on each thread it is running on, which means memory overload and, mostly, much time spent reconciling all sub-results at the end of the line[@foreachSlower][@foreachCombine][@rbindlist]. On non-distributed systems, this can quickly become overkill[@errorWrittingToConnection][@errorWrittingToConnection2].
Only once _Rcpp_ was considered could a reasonable processing time be achieved for knn training. It literally turned hours of training time into minutes.
The Rcpp package provides an API on top of R, permitting direct interchange of R objects between R and C++[^id_848]. That way, access is given to efficient memory pre-allocation, which saves us from the burden of repetitively resizing objects, as well as to the usage of pointers and references (addresses), which saves us from the hassle of unnecessarily overcrowding the memory. It's fairly easy to jumpstart Rcpp coding thanks to the excellent "**Rcpp for everyone**" by Masaki E. TSUDA[@rcppForEveryone]. The _documentation_ pages put together by the initial and main author of the package, Dirk EDDELBUETTEL, can also be of some assistance: [@rcppSugarFunctions].
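As a taste of how low the entry barrier is, here is a minimal, self-contained example (unrelated to the actual model code) compiling a C++ function straight from an R session:
```{r echo = TRUE, eval = FALSE }
# compile a C++ function and expose it to the R session :
Rcpp::cppFunction( '
  double sum_squares( NumericVector x ) {
    double total = 0 ;
    for( int i = 0 ; i < x.size() ; i++ ) total += x[ i ] * x[ i ] ;
    return total ;
  }' )
sum_squares( c( 1, 2, 3 ) ) # 14
```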
For the Rcpp package to be usable, a working C++ compiler is also needed in order to build binary packages. On Windows, you can install "Rtools" ('https://cran.r-project.org/bin/windows/Rtools/'). To check whether or not you have a working version of Rtools associated to your R session, you can use the R line of code below:
```{r eval=FALSE}
devtools::find_rtools( debug = TRUE )
#Either a visible TRUE if rtools is found, or an invisible FALSE with a diagnostic message.
```
[^id_726]: All product and company names are trademarks^TM^ or registered^®^ trademarks of their respective holders.
[^id_729]: Gaming laptops are strong and make great Machine Learning platforms.
[^id_312]: packages used in that context were _foreach_, _parallel_, _doParallel_ and _doSNOW_[@progBarDoPar]
[^id_848]: to abide by the requirements of this report, the project had to be made up of only 2 R files: one for the report generation itself and a single file for the entire model. The _filecontents_ package from _MiKTeX_ has thus been used to embed the bibliography at the end of this report. Similarly, the _inline_ R package could have been used to embed the Rcpp source code inside the model R source file itself; here, the _Rcpp::sourceCpp_ function sufficed.
<!--
amc_pdf_print()
-->
In addition to the above-mentioned arrangements, many tricks in the book had to be used to manipulate/operate on large objects efficiently (low memory consumption, fast execution times). For that reason, the entire algorithm has been developed from scratch, with two exceptions, where available libraries already provided hard-to-beat performance: namely the _coop_ package for _cosine similarity matrix computation_ and the _stats_ package for _PCA principal components decomposition_:
* the coop package[^id_748] is the only one that could be identified as being able to provide a stable cosine similarity matrix computation for cases with _missing values_, i.e. cases like the one we have with the movielens dataset, where not every user/movie pair is assigned a rating. As a side advantage, it is really fast. For readers eager to learn about such an algorithm's inner workings, refer to the excellent post by Rebecca BARTER, where we can find a native R version of a "pairwise.complete" matrix computation: [@nativeCosineSimilarity][^id_800].
* the _stats_ package performs really well in conjunction with _Microsoft R Open_ and _intel MKL_. If you're nevertheless interested in investigating the possibility of developing your own implementation, as a starting point you can refer to Day 92 of Tomáš Bouda's 100 days of algorithms and his PCA algorithm in R: [@pcainRalgo]
Of course, readily available solutions, such as the ones offered by the _recommenderlab_ package, already exist. They are however very memory-hungry and very demanding in terms of processing power (which often translates into unbearably long training times).
Further enhancements can of course still be brought to the algorithm as delivered. It would for instance be an easy win to further extend the resort to Rcpp, or even to rely on the _RcppParallel_ package. Another area that has not been explored, but which could present advantages, lies with _Microsoft MPI_[^id_560] in conjunction with the _Rmpi_ package[@installRmpi].
An even larger gain could indeed be achieved if all these solutions were used in conjunction with the GPU.
A detailed trace generated during the training of a fully cross-validated final model instance can be found in [Appendix C](#appCanchor). It shows progress information together with timing measures.
[^id_748]: in order of increasing performance, other packages considered for that job were _lsa_ and _proxy_.
[^id_800]: Rebecca BARTER introduces this similarity measure in the context of _Natural Language Processing (NLP)_, as _document classification_ is another area in which reliance on cosine similarity is widespread.
[^id_560]: a Microsoft implementation of the Message Passing Interface standard[@mpiStandard] for developing and running parallel applications on a Windows platform.
\clearpage
<!--
amc_pdf_print()
-->
# Analysis
In the following section, we're going to explain the process and techniques used, such as data exploration and visualization, insights gained, and our modeling approach.
So let's start building this model, shall we?
---
## K-fold cross validation{#crossValidAnchor}
We cannot go any further in this report without first dealing with the concept of **cross validation**.
Imagine that a user is in a particularly bad mood. He/she has had a very bad day. Then he/she rates a movie. The rating provided is well below the one that would have been given in any other circumstance.
It is precisely so that the trained model is not too influenced by such individual anomalies (outliers) that modern machine learning systematically involves cross-validation, via a resampling method such as k-fold[@kfoldVsBootstrap].
Let's picture our dataset for a moment as a French baguette[@frenchBaguette]. We start by putting aside a slice representing 10% of that baguette for later validation of our final model instance (see the [Results section](#resultsAnchor)). With the remaining 90%, let's cut k equal-size slices. In our case, as schematized in figure \ref{fig:fig_cv} below, we'll make k = 10, i.e. employ 10 folds in our model cross-validation.
We'll train a model instance on 9 slices of _training data_ and optimize its hyperparameters against the 10th slice, which is used as our _test data_. We do this 10 times, using a different slice as our test dataset each time.
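In base R, such a split can be sketched as follows (illustrative only; `ratings` is a placeholder data frame, and the actual model code differs):
```{r echo = TRUE, eval = FALSE }
set.seed( 1 )
n <- nrow( ratings )
# 10% held out for the final validation :
validation_index <- sample( n, size = round( .1 * n ) )
remaining_index <- setdiff( seq_len( n ), validation_index )
# assign each remaining rating to one of k = 10 folds :
fold_id <- sample( rep( 1:10, length.out = length( remaining_index ) ) )
```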
```{r fig_cv, echo = FALSE, fig.height = .9, fig.cap="\\label{fig:fig_cv}Dataset splitting"}
data.frame(
x1 = c( 0, 9, 18, 27, 36, 45, 54
, 63, 72, 81, 90 )
, x2 = c( 9, 18, 27, 36, 45, 54
, 63, 72, 81, 90, 100 )
, y1 = rep( 0, 11 )
, y2 = rep( 1, 11 )
, t = c('a','a','a','a','a','a','a','a','a','a','b')
, r = c( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, "" ) ) %>%
ggplot() +
geom_rect(
mapping =
aes(
xmin = x1, xmax = x2, ymin =y1, ymax = y2
, fill = t
)
, color = "black"
, alpha = .5
, show.legend = FALSE ) +
geom_text(
aes( x = x1 + ( x2 - x1 ) / 2
, y = y1 + ( y2 - y1 ) / 2
, label = r )
, size = 4 ) +
scale_x_continuous( expand=c( -1, 101 ) ) +
scale_y_continuous(expand=c( 0, 0 ) ) +
theme( axis.line = element_blank()
#, axis.text.x = element_blank()
, axis.text.y = element_blank()
, axis.ticks = element_blank()
, axis.title.x = element_blank()
, axis.title.y = element_blank()
, legend.position = "none"
, panel.background = element_blank()
, panel.border = element_blank()
, panel.grid=element_blank()
, plot.background = element_blank() ) +
annotate( geom = "text", label = "validation"
, x = 95, y = .5, angle = 90
, size = 3 )
```
> Please do kindly bear with me. _Hyperparameter optimization_ simply consists in measuring an optimization metric, in our case the RMSE, for different possible values of a parameter to be tuned. Each such value gives a different model RMSE. The **optimum** value for that parameter is the one which gives the lowest RMSE for our model. We'll come to that in detail in the following sections of the report.
<!--
amc_pdf_print()
-->
## Regularization
### user effect and movie effect
Figure \ref{fig:fig_eff} below shows that (a) not all users rate the same on average and that (b) not all movies get rated the same on average.
```{r fig_eff, echo = FALSE, fig.height = 1.8, fig.cap="\\label{fig:fig_eff}User & Movie effects"}
if( !is.null( trained_model_instance ) ) {
modeled_residuals <-
data.table::melt(
trained_model_instance$residuals_matrix
, na.rm = TRUE
)
colnames( modeled_residuals ) <-
c( "userId", "movieId", "residual" )
setDT( modeled_residuals, key = c( "userId", "movieId" ) )
rebuilt_ratings <-
modeled_residuals[
trained_model_instance$user_avgs, on = "userId" ][
trained_model_instance$movie_avgs, on = "movieId" ][
, -c( "title" ) ][
, rating :=
trained_model_instance$mu_hat +
b_u_hat + b_i_hat + residual
][
, -c( "residual", "b_u_hat", "b_i_hat" )
]
rm( modeled_residuals )
users_avg_ratings_plot <-
rebuilt_ratings %>% group_by( userId ) %>% summarize( avg = mean( rating ) ) %>%
ggplot( aes( x = avg ) ) +
geom_histogram( binwidth = .5
, color = "darkgrey"
, fill = "transparent" ) +
xlab( "average rating\n(a)") +
ylab( "users count")
movies_avg_ratings_plot <-
rebuilt_ratings %>% group_by( movieId ) %>% summarize( avg = mean( rating ) ) %>%
ggplot( aes( x = avg ) ) +
geom_histogram( binwidth = .5
, color = "darkgrey"
, fill = "transparent" ) +
xlab( "average rating\n(b)") +
ylab( "movies count")
rm( rebuilt_ratings )
grid.arrange(
users_avg_ratings_plot
, movies_avg_ratings_plot
, ncol = 2 )
rm( users_avg_ratings_plot, movies_avg_ratings_plot )
} else {
empty_plot
}
```
Our linear model of the users/movies rating allocation can be written down as follows:
$$\hat{y}_{u,i} = \hat{\mu} + \hat{b_{i}} + \hat{b_{u}}$$
where $\mu$ is the average of all ratings, $b_{i}$ stands for the movie effect for movie "_i_" (i.e. the _"bias"_ for movie "_i_") and $b_{u}$ stands for the user effect for user "_u_" (i.e. the _"bias"_ for user "_u_"). With such a model, all in all, the estimate of the sum $\mu + b_{i} + b_{u}$ equates to our prediction $\hat{y}_{u,i}$.
And here's how the RMSE equation translates :
$$
\begin{aligned}
RMSE &= \sqrt{ \frac{1}{N} \sum_{u,i} (y_{u,i} - (\hat{\mu} + \hat{b_{i}} + \hat{b_{u}}))^2 } \\
&= \sqrt{ \frac{1}{N} \sum_{u,i} \epsilon_{u,i}^2 }
\end{aligned}
$$
Where $\epsilon_{u,i}$ are the _residuals_ of our linear model, i.e. the respective differences between each _true rating_ and our corresponding _prediction_.
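In practice, these estimates boil down to group averages. A minimal sketch (assuming a `train_set` data frame with `userId`, `movieId` and `rating` columns; the actual model code is more memory-conscious):
```{r echo = TRUE, eval = FALSE }
mu_hat <- mean( train_set$rating ) # overall average rating
movie_avgs <- train_set %>%
  group_by( movieId ) %>%
  summarize( b_i_hat = mean( rating - mu_hat ) ) # movie effect
user_avgs <- train_set %>%
  left_join( movie_avgs, by = "movieId" ) %>%
  group_by( userId ) %>%
  summarize( b_u_hat = mean( rating - mu_hat - b_i_hat ) ) # user effect
```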
<!--
amc_pdf_print()
-->
### regularization
As can be seen in figure \ref{fig:fig_reg}(a) and (b) below, on the left part of the abscissa (x) axes, many users provided few ratings and many movies received few ratings.
```{r fig_reg, echo = FALSE, fig.height = 1.8, fig.cap="\\label{fig:fig_reg}Distribution of ratings"}
if( !is.null( trained_model_instance ) ) {
users_ratings_count_plot <-
data.table(
x = rowSums( !is.na(
trained_model_instance$residuals_matrix
) ) ) %>%
ggplot( aes( x = x ) ) +
geom_histogram( bins = 100
, color = "darkgrey"
, fill = "transparent" ) +
scale_x_log10() +
xlab( "ratings count (log10)\n(a)") +
ylab( "users count") +
ggtitle( "users/ratings distribution")
movies_ratings_count_plot <-
data.table(
x = colSums( !is.na(
trained_model_instance$residuals_matrix
) ) ) %>%
ggplot( aes( x = x ) ) +
geom_histogram( bins = 100
, color = "darkgrey"
, fill = "transparent" ) +
scale_x_log10() +
xlab( "ratings count (log10)\n(b)") +
ylab( "movies count") +
ggtitle( "movies/ratings distribution")
grid.arrange(
users_ratings_count_plot
, movies_ratings_count_plot
, ncol = 2 )
rm( users_ratings_count_plot, movies_ratings_count_plot )
} else {
empty_plot
}
```
From the two above-mentioned observations, one can conclude that some ratings are to be considered more significant/representative than others (e.g. if a movie only has one 5-star rating, is it really a 5-star movie? Comparably, if a user only gave one 1-star rating, is it really a user who rates lower than the average user? Too few records are not enough to draw conclusions).
To the RMSE equation, we can thus add a penalty term that gets larger with large $b_{i}$ and/or $b_{u}$. By doing so, we shrink the coefficient estimates towards zero (no user bias, no movie bias, for non-statistically-significant records).
$$RMSE_{penalized} = \sqrt{ \frac{1}{N} \sum_{u,i} (y_{u,i} - (\hat{\mu} + b_{i} + b_{u}))^2 \quad + \quad \lambda (\sum_{i} b_{i}^2 + \sum_{u} b_{u}^2) }$$
Using calculus, it can be shown that the value of $b_{i}$ minimizing this equation is a shrunken average, $\hat{b_{i}} = \frac{1}{n_{i} + \lambda} \sum_{u=1}^{n_{i}} (y_{u,i} - \hat{\mu})$, where $n_{i}$ is the number of ratings for movie "_i_" (and similarly for $b_{u}$, with $n_{u}$ the number of ratings given by user "_u_"). When $n_{i}$ is large compared to $\lambda$, the penalty has hardly any effect; when $n_{i}$ is small, the estimate is shrunk towards zero. $\lambda$ is the _penalization term_; the hyperparameter to be optimized during model training.
For each of the model instances trained during k-fold cross validation, we'll establish the optimum value of $\lambda$: the one minimizing $RMSE_{penalized}$, as sketched below. We'll finally apply the average of these "k" different values of $\lambda$ to our final model. Stay tuned!
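A sketch of that tuning loop, reusing the `RMSE` helper sketched earlier (`train_set` and `test_set` are placeholder data frames; the actual cross-validated model code differs):
```{r echo = TRUE, eval = FALSE }
lambdas <- seq( 0, 10, .25 )
rmses <- sapply( lambdas, function( lambda ) {
  mu_hat <- mean( train_set$rating )
  movie_avgs <- train_set %>%
    group_by( movieId ) %>%
    summarize( b_i_hat = sum( rating - mu_hat ) / ( n() + lambda ) )
  user_avgs <- train_set %>%
    left_join( movie_avgs, by = "movieId" ) %>%
    group_by( userId ) %>%
    summarize( b_u_hat = sum( rating - mu_hat - b_i_hat ) / ( n() + lambda ) )
  predictions <- test_set %>%
    left_join( movie_avgs, by = "movieId" ) %>%
    left_join( user_avgs, by = "userId" ) %>%
    mutate( pred = mu_hat + b_i_hat + b_u_hat ) %>%
    pull( pred )
  RMSE( test_set$rating, predictions )
} )
best_lambda <- lambdas[ which.min( rmses ) ]
```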
## K Nearest Neighbors {#knnAnchor}
The final result doesn't look anything like it, but this August 2015 **datascience.stackexchange** post by Bartłomiej TWARDOWSKI served as the initial inspiration for the _knn training_ part of the algorithm developed in the frame of this report: [@rKnnAlgo].
There are several popular means of comparing items via measuring distances/similarities[@distSimMeasures]. The Netflix^TM^ challenge winning team went for the **cosine similarity**. The cosine similarity between two movies is the cosine of the angle between the two corresponding "movie" vectors:
$$
\begin{aligned}
similarity( \vv{A}, \vv{B} ) & = cos( \Theta )
\\
& = \frac {\vv{A} \cdot \vv{B}}{||\vv{A}|| \cdot ||\vv{B}||}
\\
& = \frac{\sum_{i = 1}^N A_i B_i}
{\sqrt{\left(\sum_{i = 1}^N A_i^2\right)
\left(\sum_{i = 1}^N B_i^2\right)}}
\end{aligned}
$$
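For two plain numeric vectors, this boils down to a few lines of R (a toy illustration; for the full similarity matrix, the model relies on the _coop_ package, as discussed in the [Little geek section](#technicalAnchor)):
```{r echo = TRUE, eval = FALSE }
cosine_similarity <- function( A, B ) {
  sum( A * B ) / sqrt( sum( A^2 ) * sum( B^2 ) )
}
cosine_similarity( c( 1, 2, 3 ), c( 2, 4, 6 ) ) # 1 : perfectly similar
cosine_similarity( c( 1, 0 ), c( 0, 1 ) )       # 0 : orthogonal
```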
```{r echo = FALSE }
movies_count <-
format( nrow( trained_model_instance$movie_avgs ), nsmall = 1, big.mark = "," )
users_count <-
format( nrow( trained_model_instance$user_avgs ), nsmall = 1, big.mark = "," )
```
In a 3D environment (N = 3), it is fairly easy to compute: both the **A** and **B** vectors have three coordinates.
In our case of computing rating similarities among movies, each "movie" vector however has one coordinate per user, which puts us in an environment of **`r users_count `** dimensions (N = `r users_count ` different users)[^id_222].
```{r echo = FALSE, results = 'asis' }
if( is.null( trained_model_instance ) || nrow( trained_model_instance$user_avgs ) > nrow( trained_model_instance$movie_avgs ) ) cat( paste0(
"This is for instance why we choose to go with \"movies\" knn here and not \"users\" knn : our dataset englobes **", users_count, "** different users. This would have made computation of the similarities matrix way too long[^id_333]."
) )
```
```{r echo = FALSE }
one_movie_column_nb <- 525
if( !is.null( trained_model_instance ) ) {
movie_similar_movies <-
trained_model_instance$similarity_matrix[ , one_movie_column_nb ] # [ , movieId == 593 ]
movie_similar_movies[ is.na( movie_similar_movies ) ] <- 0 # replace missing similarities with 0
movie_similar_movies_plot <-
data.frame(
x = movie_similar_movies ) %>%
ggplot( aes( x = x ) ) +
geom_histogram( aes( y = ..count.. )
, binwidth = .25
, fill = "transparent"
, color = "darkgrey" ) +
xlab( "similarity" ) +
ylab( "movies count" ) +
scale_x_continuous(
breaks = c( -1, -0.875, -0.625, -0.375, -0.125
, 0.125, 0.375, 0.625, 0.875, 1 )
, labels = c( "-1", "\n-0.875", "-0.625", "\n-0.375", "-0.125"
, "\n0.125", "0.375", "\n0.625", "0.875", "\n1" )
) +
stat_bin( aes( y = ..count..
, label =
format( ..count..
, nsmall = 0
, big.mark = "," ) )
, geom="text", vjust=.1
, binwidth = .25, size = 8
, color = "grey42" )
rm( movie_similar_movies )
# trained_model_instance$movie_avgs[ movieId == 593 ]
# trained_model_instance$movie_avgs[ one_movie_column_nb ]
} else {
movie_similar_movies_plot <- empty_plot
}
```
\Begin{wrapfigure}{r}{0.3\textwidth}
```{r include=TRUE, echo=FALSE, fig.show='asis', fig.keep='high' }
movie_similar_movies_plot + theme_amc( base_size_ = 28 )
```
\caption{\label{fig:movie_similar_movies}Cosine similarity of a movie against all the others}
\End{wrapfigure}
```{r echo = FALSE }
rm( movie_similar_movies_plot )
```
<!--
amc_pdf_print()
-->
Similarity ranges from -1 (exactly opposite) to 1 (exactly the same), with 0 indicating orthogonality (decorrelation), while in-between values indicate intermediate similarity or dissimilarity.
For instance, for the movie **"`r trained_model_instance$movie_avgs[ one_movie_column_nb ]$title `"**, figure **\ref{fig:movie_similar_movies}** shows the distribution of similarities with each of the `r movies_count ` movies of our dataset[^id_010].
Be reminded that we are considering residuals from the regularized linear modelling of ratings here.
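A sketch of how such a prediction can be formed for one user and one target movie (hypothetical placeholders: `similarities` is a named vector of the target movie's similarities, `user_residuals` a named vector of that user's known residuals, both indexed by movieId):
```{r echo = TRUE, eval = FALSE }
k <- 30
# the k movies most similar to the target movie :
neighbors <- names( sort( similarities, decreasing = TRUE ) )[ 1:k ]
weights <- similarities[ neighbors ]
known <- !is.na( user_residuals[ neighbors ] )
# similarity-weighted average of the residuals the user gave to those neighbors :
predicted_residual <-
  sum( weights[ known ] * user_residuals[ neighbors ][ known ] ) /
  sum( weights[ known ] )
```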
```{r echo = FALSE }
rm( movies_count, users_count, one_movie_column_nb )
```
[^id_222]: this well-known fact is called the **curse of dimensionality**. A very educational explanation of that phenomenon can be found here: [@curseOfDim].
[^id_333]: it is interesting to note however that, the higher the number of dimensions, the better the accuracy of the knn predictions, since discriminating between rating profiles gets more precise the more users there are (e.g. the more users there are, the more similar to a particular user its k closest neighbors get). Going with "users" knn modeling would thus have brought an increased accuracy.
[^id_010]: we consider pairwise similarities (see [Little geek section](#technicalAnchor)).
---
<!--
amc_pdf_print()
-->
\clearpage
## Principal Component Analysis
```{r echo = FALSE }
prcomp <- min(
20
, ifelse( !is.null( trained_model_instance)
, trained_model_instance$prcomp_rank
, 20 ) )
prcomp_tab <- min( 15, prcomp )
sample_users_count <- 80 # 30 #
sample_movies_count <- 40 # 25 #
```
Matrix factorization is a mathematical method that makes it possible to decompose a matrix into two matrices of lower dimensions[^id_811].
The variability of the ratings (across users and movies) can thus be decomposed into two matrices: the **weights** matrix and the **pattern** matrix.
In figure \ref{fig:fig_matfac}, we can see a graphical representation of a matrix factorization of a sample[^id_999] from our dataset.
$$ residuals\_matrix = weights\_matrix * pattern\_matrix^T$$
The principle consists in retaining the right amount of variability from the training dataset, i.e. isolating the part of the variability that relates purely to randomness, so as to ignore it in future predictions. Taking into account the right number of principal components brings the best ability to predict future observations. This number is also a hyperparameter to be optimized during model training.
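With the _stats_ package, such a decomposition can be sketched as follows (on a placeholder `residuals_matrix` with users as rows and movies as columns, missing values having been imputed beforehand):
```{r echo = TRUE, eval = FALSE }
pca <- stats::prcomp( residuals_matrix, center = FALSE, scale. = FALSE )
weights_matrix <- pca$x # "users count" rows : the "weights"
pattern_matrix <- pca$rotation # "movies count" rows : the "pattern"
# keeping only the first 'rank' components approximates the residuals :
rank <- 20
approx_residuals <- weights_matrix[ , 1:rank ] %*% t( pattern_matrix[ , 1:rank ] )
```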
[^id_811]: another interesting approach to introducing matrix factorization than the one adopted in this report can be found here: [@pcaGuide], with further insights here: [@pcainR].
[^id_999]: a random sample of ratings for `r sample_users_count` users and `r sample_movies_count` movies, where only the first `r prcomp` principal components are shown.
### The "weights" matrix
The **weights** matrix has _"users count"_ rows and is made up of uncorrelated **principal component** columns, each of which has a decreasing standard deviation (i.e. a decreasing variability). To put it simply, **PC1**, the first principal component, the one with the highest standard deviation, translates the movie characteristic that divides our set of users the most. Table \ref{tab:sdev_tab} below shows the respective standard deviations of the first `r prcomp_tab` principal components of the entire dataset.
```{r echo = FALSE }
if( !is.null( trained_model_instance ) ) {
var_explained_df <-
t( data.frame(
rank = as.character( round( as.integer( 1:prcomp_tab ), 0 ) )
, "sdev.percent" = round( trained_model_instance$pca$sdev[ 1:prcomp_tab ], 2 ) ) )
knitr::kable(
var_explained_df
, format = "latex", booktabs = TRUE
, caption = "\\label{tab:sdev_tab}Principal components standard deviation"
, row.names = TRUE
, align = paste0( rep( "c", prcomp_tab )
, collapse = "" )
) %>%
kableExtra::kable_styling(
font_size = 8, latex_options = c( "hold_position" ) )
} else {
knitr::kable(
empty_table
, format = "latex", booktabs = TRUE
, caption = "\\label{tab:sdev_tab}" ) %>%
kableExtra::kable_styling(
font_size = 8, latex_options = c( "hold_position" ) ) %>%
row_spec( 1:nrow( empty_table ), color = "blue" )
}
```
<!--
amc_pdf_print()
-->
### The "pattern" matrix
The **pattern** matrix has _"movies count"_ rows and encompasses the relation between each movie and each principal component. It's in the _pattern_ matrix that we can identify clusters of movies. For instance, looking at _PC1_, the first principal component, the one explaining most of the user ratings variability, we can guesstimate what the most important characteristic within our set of movies is.
```{r echo = FALSE }
if( !is.null( trained_model_instance ) ) {
residuals_pcs <- data.frame(
round( trained_model_instance$pca$rotation
, digits = 4 )
, title = trained_model_instance$movie_avgs$title
, stringsAsFactors = FALSE )
pc_bottom_tab <-
residuals_pcs %>%
mutate( title = str_trunc(
title, 30 ) ) %>%
dplyr::select( title, PC1 ) %>%
arrange( PC1 ) %>%
slice( 1:10 )
pc_top_tab <-
residuals_pcs %>%
mutate( title = str_trunc(
title, 30 ) ) %>%
dplyr::select( title, PC1 ) %>%
arrange( desc( PC1 ) ) %>%
slice( 1:10 )
dev_dataset <-
( pc_top_tab$title[ 1 ] ==
"independence Day (a.k.a. iD..." )
} else {
pc_bottom_tab <- empty_table
pc_top_tab <- empty_table
dev_dataset <- FALSE
}
```
Table \ref{tab:top_pc1} lists the 10 movies that are on one extreme end of PC1 and table \ref{tab:bottom_pc1} the 10 movies that are on the other. From there, we can infer that it separates `r ifelse( !dev_dataset, "thrillers from disaster films", "'science fiction / superheros' movies from drama" )`[^id_707].
```{r echo = FALSE, results = 'asis' }
cat( paste0( "\\begin{table}[!htb]
\\begin{minipage}{.5\\linewidth}
\\centering
\\caption{Top 10 PC1 movies}\\label{tab:top_pc1}
", knitr::kable( pc_top_tab
, format = "latex"
, booktabs = TRUE
, linesep = "" ) %>%
row_spec( 1:nrow( pc_top_tab )
, color = ifelse( !is.null( trained_model_instance )
, "black", "blue" ) ) %>%
kableExtra::kable_styling(
font_size = 8 )
, "\\end{minipage}%
\\begin{minipage}{.5\\linewidth}
\\centering
\\caption{Reverse top 10 PC1 movies}\\label{tab:bottom_pc1}
", knitr::kable( pc_bottom_tab
, format = "latex"
, booktabs = TRUE
, linesep = "" ) %>%
row_spec( 1:nrow( pc_top_tab )
, color = ifelse( !is.null( trained_model_instance )
, "black", "blue" ) ) %>%
kableExtra::kable_styling(
font_size = 8 )
, "\\end{minipage}
\\end{table}" ) )
```
Comparatively, with tables \ref{tab:top_pc2} and \ref{tab:bottom_pc2}, we can do the same for **PC2**, the second principal component, which then seems to capture the dichotomy in the tastes of our set of users between `r ifelse( !dev_dataset, "'science fiction / superheros' movies and drama", "fantasy movies and 'violence & crimes' movies" )`.
```{r echo = FALSE, results = 'asis' }
if( !is.null( trained_model_instance ) ) {
pc_bottom_tab <-
residuals_pcs %>%
mutate( title = str_trunc(
title, 30 ) ) %>%
dplyr::select( title, PC2 ) %>%
arrange( PC2 ) %>%
slice( 1:10 )
pc_top_tab <-
residuals_pcs %>%
mutate( title = str_trunc(
title, 30 ) ) %>%
dplyr::select( title, PC2 ) %>%
arrange( desc( PC2 ) ) %>%
slice( 1:10 )
rm( residuals_pcs )
} else {
pc_bottom_tab <- empty_table
pc_top_tab <- empty_table
}
cat( paste0( "\\begin{table}[!htb]
\\begin{minipage}{.5\\linewidth}
\\centering
\\caption{Top 10 PC2 movies}\\label{tab:top_pc2}
", knitr::kable( pc_top_tab
, format = "latex"
, booktabs = TRUE
, linesep = "" ) %>%
row_spec( 1:nrow( pc_top_tab )
, color = ifelse( !is.null( trained_model_instance )
, "black", "blue" ) ) %>%
kableExtra::kable_styling(
font_size = 8 )
, "\\end{minipage}%
\\begin{minipage}{.5\\linewidth}
\\centering
\\caption{Reverse top 10 PC2 movies}\\label{tab:bottom_pc2}
", knitr::kable( pc_bottom_tab
, format = "latex"
, booktabs = TRUE
, linesep = "" ) %>%
row_spec( 1:nrow( pc_top_tab )
, color = ifelse( !is.null( trained_model_instance )
, "black", "blue" ) ) %>%
kableExtra::kable_styling(
font_size = 8 )
, "\\end{minipage}
\\end{table}" ) )
rm( pc_top_tab, pc_bottom_tab )
```
[^id_707]: subject to interpretation; principal components obviously vary with different datasets.
<!--
amc_pdf_print()
-->
### Variance explained
```{r echo = FALSE }
if( !is.null( trained_model_instance ) ) {
sdev_vectors_list <-
lapply( trained_model_instance$cv_k_models %>% select( pca )
, function(
pca_item ) lapply( pca_item
, function( pca_item ) pca_item$sdev
) )$pca
sdev_table <-
data.table( array( unlist( sdev_vectors_list )
, dim =
c( length( sdev_vectors_list[[ 1 ]] )
, length( sdev_vectors_list ) )
) )
colnames( sdev_table ) <-
paste0( "fold_"
, seq( 1, length( sdev_vectors_list ) ) )
rownames( sdev_table ) <-
seq( 1 : length( sdev_vectors_list[[ 1 ]] ) )
sdev_table <-
bind_cols(
rank = seq( 1 : length( sdev_vectors_list[[ 1 ]] ) )
, sdev_table
, avg = rowMeans( sdev_table )
, final = trained_model_instance$pca$sdev )
rm( sdev_vectors_list )
#str( sdev_table )
var_explained_df <-
sdev_table %>%
melt( id = "rank"
, variable.name = "model_instance"
, value.name = "sdev" ) %>%
group_by( model_instance ) %>%
mutate( var_explained =
cumsum( sdev^2 / sum( sdev^2 ) ) ) %>%
ungroup
rm( sdev_table )
} else {
var_explained_df <- NULL
}
```