-
Notifications
You must be signed in to change notification settings - Fork 0
/
rapport.aux
212 lines (212 loc) · 14.7 KB
/
rapport.aux
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
\relax
\select@language{english}
\@writefile{toc}{\select@language{english}}
\@writefile{lof}{\select@language{english}}
\@writefile{lot}{\select@language{english}}
\citation{*}
\citation{WikiNGS}
\@writefile{toc}{\contentsline {chapter}{\numberline {1}Scientific context}{1}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {section}{\numberline {1.1}Definition and goals of metagenomics}{1}}
\@writefile{lof}{\contentsline {figure}{\numberline {1.1}{\ignorespaces An example of alignment of DNA sequence, from \emph {Gap penalty} article in \textsc {Wikipedia}. The upper read is the reference to which the lower read is aligned}}{2}}
\@writefile{toc}{\contentsline {section}{\numberline {1.2}Assignment/identification of reads to a certain species of bacteria}{3}}
\@writefile{lof}{\contentsline {figure}{\numberline {1.2}{\ignorespaces A partial taxonomic tree from \textsc {GreenGenes} and the taxonomy of the fox, from \emph {Taxonomic Rank} article in \textsc {Wikipedia}}}{3}}
\citation{Tarjan}
\citation{Tango1}
\citation{Tango3}
\citation{Tango1}
\citation{RobinsonFoulds}
\citation{Alignment}
\citation{Enaud}
\@writefile{lof}{\contentsline {figure}{\numberline {1.3}{\ignorespaces The dark green node is the LCA of \emph {x} and \emph {y}, from \emph {Lowest Common Ancestor} article in \textsc {Wikipedia}}}{4}}
\@writefile{toc}{\contentsline {section}{\numberline {1.3}Topics in metagenomics}{4}}
\citation{Wilcoxon}
\citation{Whitney}
\citation{McNemar}
\citation{Enaud}
\@writefile{toc}{\contentsline {section}{\numberline {1.4}The test database}{5}}
\@writefile{toc}{\contentsline {chapter}{\numberline {2}Subject of the internship}{6}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {section}{\numberline {2.1}Problem of most different pairs (MDPairs)}{6}}
\@writefile{lof}{\contentsline {figure}{\numberline {2.1}{\ignorespaces A screenshot of the occurrence matrix from the test database}}{7}}
\@writefile{lof}{\contentsline {figure}{\numberline {2.2}{\ignorespaces A screenshot of the info matrix from the test database}}{7}}
\@writefile{toc}{\contentsline {section}{\numberline {2.2}Problems of clustering (CLust)}{7}}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.2.1}Problem of compatibility (CL.Comp)}{8}}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.2.2}Problem of best clustering (CL.BCLust)}{8}}
\@writefile{toc}{\contentsline {chapter}{\numberline {3}Statistics-inspired approach (TaxoTree software)}{10}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {section}{\numberline {3.1}Definitions}{10}}
\@writefile{lof}{\contentsline {figure}{\numberline {3.1}{\ignorespaces Subtrees induced by three groups of samples (each corresponding to a single colour). Incuced subtrees can of course overlap.}}{10}}
\@writefile{toc}{\contentsline {section}{\numberline {3.2}Description of the method}{11}}
\@writefile{toc}{\contentsline {subsection}{\numberline {3.2.1}Total Ratio}{11}}
\@writefile{lof}{\contentsline {figure}{\numberline {3.2}{\ignorespaces Example for \textsc {Total Ratio}: blue nodes are common to green and red trees. Green and red nodes correspond to two distinct groups of samples. Here, $TR$ = ${\begingroup 10 + 3 + 34\endgroup \over (1 + 34 + 5) + (10 + 3 + 34) + (24 + 16)} \simeq 0.37$}}{11}}
\citation{Enaud}
\citation{PhyloD}
\citation{MeanD}
\@writefile{toc}{\contentsline {subsection}{\numberline {3.2.2}Pattern Ratio}{12}}
\@writefile{lof}{\contentsline {figure}{\numberline {3.3}{\ignorespaces Example for \textsc {Pattern Ratio}: violet nodes are common to cyan and yellow trees. Cyan and yellow nodes correspond to two distinct groups of samples. Here, $PR$ = ${\begingroup 10 + 34 + 3 + 24\endgroup \over (1 + 34 + 12) + (23 + 45 + 16 + 5)} \simeq 0.52$}}{12}}
\@writefile{toc}{\contentsline {subsection}{\numberline {3.2.3}Microbial Diversity}{12}}
\@writefile{lof}{\contentsline {figure}{\numberline {3.4}{\ignorespaces Example for \textsc {Microbial Diversity}: Here, \textsc {Microbial Diversity} coefficient for violet tree is: $MD$ = ${\begingroup 5\endgroup \over 12} \simeq 0.42$}}{13}}
\@writefile{toc}{\contentsline {subsection}{\numberline {3.2.4}Computation of the similarity coefficient}{13}}
\citation{Pearson}
\citation{Enaud}
\@writefile{toc}{\contentsline {section}{\numberline {3.3}Implementation}{14}}
\@writefile{toc}{\contentsline {section}{\numberline {3.4}Results}{15}}
\@writefile{toc}{\contentsline {subsection}{\numberline {3.4.1}Tests}{15}}
\@writefile{lot}{\contentsline {table}{\numberline {3.1}{\ignorespaces Results from \textsc {TaxoTree} for the comparison at Day 0, (*) with Day = 0, (**) with (Clostridium bolteae,S),(Pediococcus acidilactici,S), where S is one of the taxonomic ranks}}{15}}
\@writefile{lof}{\contentsline {figure}{\numberline {3.5}{\ignorespaces Microbial Diversity Day=0, ATB-IV=0 (top) and ATB-IV=1 (bottom)}}{16}}
\@writefile{lot}{\contentsline {table}{\numberline {3.2}{\ignorespaces Results from \textsc {TaxoTree} for the comparison at Day 0, Day 45, Day 90}}{17}}
\@writefile{lot}{\contentsline {table}{\numberline {3.3}{\ignorespaces Results from \textsc {TaxoTree} for the comparison at Day 90, (*) for Day 0, (**) for Day 90}}{17}}
\@writefile{lof}{\contentsline {figure}{\numberline {3.6}{\ignorespaces Microbial Diversity Day=0, ATB-IV=0 (top) and ATB-IV=1 (bottom)}}{18}}
\@writefile{toc}{\contentsline {subsection}{\numberline {3.4.2}Overview and discussion}{19}}
\@writefile{toc}{\contentsline {subsubsection}{About the method}{19}}
\@writefile{toc}{\contentsline {subsubsection}{About the numerical results}{19}}
\citation{SSS}
\@writefile{toc}{\contentsline {chapter}{\numberline {4}Supervised learning (TaxoClassifier software)}{20}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {section}{\numberline {4.1}Machine Learning and supervised learning: the Naive Bayes classifier}{20}}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.1.1}Machine Learning}{20}}
\citation{Nikolski}
\citation{NaiveBayes}
\citation{NaiveBayes}
\citation{NaiveBayes}
\citation{NaiveBayes}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.1.2}Supervised Learning}{21}}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.1.3}Naive Bayes Classifier}{21}}
\citation{NaiveBayes}
\citation{Youden}
\citation{F-measure}
\citation{IntroROC}
\citation{FMproblems}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.1.4}Youden's J coefficient}{22}}
\@writefile{lof}{\contentsline {figure}{\numberline {4.1}{\ignorespaces \textsc {Confusion matrix} describing the four classes from gepsoft.com}}{22}}
\citation{F-measure}
\citation{F-measure}
\@writefile{toc}{\contentsline {section}{\numberline {4.2}Description of the method}{23}}
\@writefile{lof}{\contentsline {figure}{\numberline {4.2}{\ignorespaces Let $M_1$ be a metadatum having values 0 (red), or 1 (green), or 2 (blue). Then there are three classes associated with $M_1$.}}{23}}
\@writefile{lof}{\contentsline {figure}{\numberline {4.3}{\ignorespaces Let $M_2$ be a metadatum having values 0 (yellow), or 1 (cyan). Then there are two classes associated with $M_2$, that may intersect classes of $M_1$. Thus there are four classes associated with $M_1$ and $M_2$ (classes do not take into account unknown values)}}{24}}
\@writefile{toc}{\contentsline {section}{\numberline {4.3}Implementation}{25}}
\@writefile{toc}{\contentsline {section}{\numberline {4.4}Results}{25}}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.4.1}Tests}{25}}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.4.2}Overview and discussion}{25}}
\@writefile{toc}{\contentsline {subsubsection}{About the method}{25}}
\citation{BayesianRating}
\citation{PartitionIsNPhard}
\citation{NPhard}
\citation{KMeans}
\citation{KMeans}
\@writefile{toc}{\contentsline {chapter}{\numberline {5}Non-supervised learning (TaxoCluster software)}{28}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {section}{\numberline {5.1}Machine Learning and non-supervised learning: the K-Means algorithm}{28}}
\@writefile{toc}{\contentsline {subsection}{\numberline {5.1.1}Non-Supervised Learning}{28}}
\@writefile{toc}{\contentsline {subsection}{\numberline {5.1.2}Clustering}{28}}
\citation{Tango1}
\@writefile{toc}{\contentsline {subsection}{\numberline {5.1.3}K-Means Algorithm}{29}}
\@writefile{toc}{\contentsline {section}{\numberline {5.2}Definitions}{29}}
\@writefile{toc}{\contentsline {subsection}{\numberline {5.2.1}Notations}{29}}
\@writefile{lof}{\contentsline {figure}{\numberline {5.1}{\ignorespaces Let T be the whole taxonomic tree. Then for the set of red nodes, the subtree associated is rooted at the violet node, and the total set of leaves for this subtree is the set of yellow and red nodes.}}{29}}
\@writefile{toc}{\contentsline {subsection}{\numberline {5.2.2}Distances}{29}}
\citation{Consensus}
\@writefile{lof}{\contentsline {figure}{\numberline {5.2}{\ignorespaces Let us consider the trees $T_{1}$ and $T_{2}$ rooted at A (trees are drawn separately for simplicity's sake), such as $M_{1}$ = \{D, E, F, H\} and $M_{2}$ = \{F, G, H\} (thus matched leaves are in green). Then $d_{consensus}(T_{1},T_{2}) = 5 + 5 - q \times (1 + 2) - 2$. When $q = 0$, $d_{consensus}(T_{1},T_{2}) = 8$. When $q = 1$, $d_{consensus}(T_{1},T_{2}) = 5$}}{30}}
\@writefile{toc}{\contentsline {section}{\numberline {5.3}Description of the method}{30}}
\@writefile{toc}{\contentsline {section}{\numberline {5.4}Implementation}{31}}
\@writefile{toc}{\contentsline {section}{\numberline {5.5}Results}{31}}
\@writefile{toc}{\contentsline {subsection}{\numberline {5.5.1}Tests}{31}}
\citation{PCA}
\@writefile{toc}{\contentsline {subsection}{\numberline {5.5.2}Overview and discussion}{32}}
\@writefile{toc}{\contentsline {subsubsection}{About the method}{32}}
\@writefile{toc}{\contentsline {chapter}{\numberline {6}Comparison of the three approaches}{33}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{lot}{\contentsline {table}{\numberline {6.1}{\ignorespaces Complexity for each method: for our test database, $n_{taxo-nodes} = 9,065$, $n_{values} \le 10$, $n_{samples} = 47$ and $n_{paths} = 5,565$}}{33}}
\citation{Nikolski}
\citation{TreeInclusion}
\@writefile{toc}{\contentsline {chapter}{\numberline {7}Outlook}{35}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {section}{\numberline {7.1}Personal work}{35}}
\@writefile{toc}{\contentsline {section}{\numberline {7.2}Comments}{35}}
\bibstyle{plain}
\bibdata{rapport.bib}
\bibcite{Consensus}{1}
\bibcite{Tango3}{2}
\bibcite{Correction}{3}
\bibcite{Tango1}{4}
\bibcite{Scripting}{5}
\bibcite{PartitionIsNPhard}{6}
\bibcite{Enaud}{7}
\bibcite{IntroROC}{8}
\bibcite{Tarjan}{9}
\bibcite{Alignment}{10}
\bibcite{TreeInclusion}{11}
\bibcite{KMeans}{12}
\bibcite{Whitney}{13}
\bibcite{BayesianRating}{14}
\bibcite{McNemar}{15}
\@writefile{toc}{\contentsline {chapter}{Bibliography}{37}}
\bibcite{Pearson}{16}
\bibcite{PCA}{17}
\bibcite{FMproblems}{18}
\bibcite{BernouilliModelDrawbacks}{19}
\bibcite{F-measure}{20}
\bibcite{RobinsonFoulds}{21}
\bibcite{NeighborJoining}{22}
\bibcite{SSS}{23}
\bibcite{UMPGA}{24}
\bibcite{Nikolski}{25}
\bibcite{PhyloD}{26}
\bibcite{MeanD}{27}
\bibcite{NaiveBayes}{28}
\bibcite{BernouilliModel}{29}
\bibcite{NPhard}{30}
\bibcite{WikiNGS}{31}
\bibcite{Wilcoxon}{32}
\bibcite{Youden}{33}
\bibcite{TreeDistance}{34}
\@writefile{toc}{\contentsline {chapter}{\numberline {A}Hardware characteristics of the testing computer}{40}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {chapter}{\numberline {B}Complexity}{41}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {section}{\numberline {B.1}Worst case complexity for the first approach (TaxoTree)}{41}}
\@writefile{toc}{\contentsline {subsection}{\numberline {B.1.1}Per function}{41}}
\@writefile{toc}{\contentsline {subsection}{\numberline {B.1.2}Overall complexity}{43}}
\@writefile{toc}{\contentsline {section}{\numberline {B.2}Worst case complexity for second approach (TaxoClassifier)}{43}}
\@writefile{toc}{\contentsline {subsection}{\numberline {B.2.1}Per function}{43}}
\@writefile{toc}{\contentsline {subsection}{\numberline {B.2.2}Overall complexity}{44}}
\@writefile{toc}{\contentsline {section}{\numberline {B.3}Worst case complexity for third approach (TaxoCluster)}{44}}
\@writefile{toc}{\contentsline {subsection}{\numberline {B.3.1}Per function}{44}}
\@writefile{toc}{\contentsline {subsection}{\numberline {B.3.2}Overall complexity}{45}}
\citation{NeighborJoining}
\citation{UMPGA}
\@writefile{toc}{\contentsline {chapter}{\numberline {C}Construction of taxonomic trees}{46}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {section}{\numberline {C.1}State-of-the-art and input}{46}}
\@writefile{toc}{\contentsline {section}{\numberline {C.2}A naive top-down construction}{47}}
\@writefile{toc}{\contentsline {section}{\numberline {C.3}A naive bottom-up construction}{47}}
\@writefile{loa}{\contentsline {algorithm}{\numberline {1}{\ignorespaces The naive top-down construction}}{48}}
\@writefile{loa}{\contentsline {algorithm}{\numberline {2}{\ignorespaces The naive bottom-up construction}}{49}}
\@writefile{toc}{\contentsline {section}{\numberline {C.4}A less naive bottom-up construction}{50}}
\@writefile{toc}{\contentsline {subsection}{\numberline {C.4.1}Pre-processing}{50}}
\@writefile{toc}{\contentsline {subsection}{\numberline {C.4.2}Final algorithm}{50}}
\@writefile{loa}{\contentsline {algorithm}{\numberline {3}{\ignorespaces The less naive bottom-up construction (pre-processing)}}{51}}
\@writefile{loa}{\contentsline {algorithm}{\numberline {4}{\ignorespaces The less naive bottom-up construction}}{53}}
\@writefile{toc}{\contentsline {chapter}{\numberline {D}Multi-dimensional lists}{54}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {section}{\numberline {D.1}Goal}{54}}
\@writefile{toc}{\contentsline {section}{\numberline {D.2}Implementation and complexity}{54}}
\@writefile{lot}{\contentsline {table}{\numberline {D.1}{\ignorespaces Worst case time complexity of different operations on MDL, (*) The \emph {deepcopy} operation is linear, but the constant is great, (**) Returns first element and the list of other elements, (***) Maps function over the elements of MDL, then returns them as a list}}{55}}
\@writefile{toc}{\contentsline {chapter}{\numberline {E}Summary (in French)}{56}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\newlabel{LastPage}{{}{58}}
\xdef\lastpage@lastpage{58}
\gdef\lastpage@lastpageHy{}