-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathKDD_HedayatiJP.tex
207 lines (185 loc) · 7.43 KB
/
KDD_HedayatiJP.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
% This is "sig-alternate.tex" V2.0 May 2012
% This file should be compiled with V2.5 of "sig-alternate.cls" May 2012
%
% This example file demonstrates the use of the 'sig-alternate.cls'
% V2.5 LaTeX2e document class file. It is for those submitting
% articles to ACM Conference Proceedings WHO DO NOT WISH TO
% STRICTLY ADHERE TO THE SIGS (PUBS-BOARD-ENDORSED) STYLE.
% The 'sig-alternate.cls' file will produce a similar-looking,
% albeit, 'tighter' paper resulting in, invariably, fewer pages.
%
% ----------------------------------------------------------------------------------------------------------------
% This .tex file (and associated .cls V2.5) produces:
% 1) The Permission Statement
% 2) The Conference (location) Info information
% 3) The Copyright Line with ACM data
% 4) NO page numbers
%
% as against the acm_proc_article-sp.cls file which
% DOES NOT produce 1) thru' 3) above.
%
% Using 'sig-alternate.cls' you have control, however, from within
% the source .tex file, over both the CopyrightYear
% (defaulted to 200X) and the ACM Copyright Data
% (defaulted to X-XXXXX-XX-X/XX/XX).
% e.g.
% \CopyrightYear{2007} will cause 2007 to appear in the copyright line.
% \crdata{0-12345-67-8/90/12} will cause 0-12345-67-8/90/12 to appear in the copyright line.
%
% ---------------------------------------------------------------------------------------------------------------
% This .tex source is an example which *does* use
% the .bib file (from which the .bbl file % is produced).
% REMEMBER HOWEVER: After having produced the .bbl file,
% and prior to final submission, you *NEED* to 'insert'
% your .bbl file into your source .tex file so as to provide
% ONE 'self-contained' source file.
%
% ================= IF YOU HAVE QUESTIONS =======================
% Questions regarding the SIGS styles, SIGS policies and
% procedures, Conferences etc. should be sent to
% Adrienne Griscti (griscti@acm.org)
%
% Technical questions _only_ to
% Gerald Murray (murray@hq.acm.org)
% ===============================================================
%
% For tracking purposes - this is V2.0 - May 2012
\documentclass{sig-alternate-05-2015}
\newtheorem{algorithm}{Algorithm}
\usepackage{algpseudocode}
\usepackage{varwidth}
\usepackage{graphicx}
\usepackage{color}
\usepackage{url}
\graphicspath{{./images}}
\begin{document}
\setcopyright{acmcopyright}
\doi{10.475/123_4}
% ISBN
\isbn{123-4567-24-567/08/06}
%Conference
\conferenceinfo{PLDI '13}{June 16--19, 2013, Seattle, WA, USA}
\acmPrice{\$15.00}
%
% --- Author Metadata here ---
%\conferenceinfo{WOODSTOCK}{'97 El Paso, Texas USA}
%
% --- Author Metadata here ---
%\conferenceinfo{WOODSTOCK}{'97 El Paso, Texas USA}
%\CopyrightYear{2007} % Allows default copyright year (20XX) to be over-ridden - IF NEED BE.
%\crdata{0-12345-67-8/90/01} % Allows default copyright data (0-89791-88-6/97/05) to be over-ridden - IF NEED BE.
% --- End of Author Metadata ---
\title{Scalable Learning of Tree-Based Models on Sparsely Representable Data}
%
% You need the command \numberofauthors to handle the 'placement
% and alignment' of the authors beneath the title.
%
% For aesthetic reasons, we recommend 'three authors at a time'
% i.e. three 'name/affiliation blocks' be placed beneath the title.
%
% NOTE: You are NOT restricted in how many 'rows' of
% "name/affiliations" may appear. We just ask that you restrict
% the number of 'columns' to three.
%
% Because of the available 'opening page real-estate'
% we ask you to refrain from putting more than six authors
% (two rows with three columns) beneath the article title.
% More than six makes the first-page appear very cluttered indeed.
%
% Use the \alignauthor commands to handle the names
% and affiliations for an 'aesthetic maximum' of six authors.
% Add names, affiliations, addresses for
% the seventh etc. author(s) as the argument for the
% \additionalauthors command.
% These 'additional authors' will be output/set for you
% without further effort on your part as the last section in
% the body of your article BEFORE References or any Appendices.
\numberofauthors{3} % in this sample file, there are a *total*
% of EIGHT authors. SIX appear on the 'first-page' (for formatting
% reasons) and the remaining two appear in the \additionalauthors section.
%
\author{
% You can go ahead and credit any number of authors here,
% e.g. one 'row of three' or two rows (consisting of one row of three
% and a second row of one, two or three).
%
% The command \alignauthor (no curly braces needed) should
% precede each author name, affiliation/snail-mail address and
% e-mail address. Additionally, tag each line of
% affiliation/address with \affaddr, and tag the
% e-mail address with \email.
%
% 1st. author
\alignauthor
Fares Hedayati\\\\
\affaddr{upwork}\\
\email{fares19@upwork.com}
% 2nd. author
\alignauthor
Arnauld Joly\\\\
\affaddr{University of Li{e}ge}\\
\email{a.joly@ulg.ac.be}
% 3rd. author
\alignauthor Panagiotis Papadimitriou\\
\affaddr{upwork}\\
\email{papadimitriou@upwork.com}
}
% There's nothing stopping you putting the seventh, eighth, etc.
% author on the opening page (as the 'third row') but we ask,
% for aesthetic reasons that you place these 'additional authors'
% in the \additional authors block, viz.
\date{10 February 2016}
\maketitle
\begin{abstract}
Many machine learning tasks such as text annotation usually require
training over very big datasets, e.g., millions of web documents,
that can be represented in a sparse input space. State-of-the-art
tree-based ensemble algorithms cannot scale to such datasets, since
they include operations whose running time is a function of the
input space size rather than a function of the non-zero input
elements.
In this paper, we propose an efficient splitting algorithm to
leverage input sparsity within decision tree
methods.
Our algorithm improves training time over sparse datasets by more
than two orders of magnitude and it has been incorporated in the current
version of \emph{scikit-learn}\footnote{http://scikit-learn.org},
the most popular open source Python machine learning library.
\end{abstract}
% A category with the (minimum) three required fields
%\category{Decision Trees}
%A category including the fourth, optional field follows...
%\category{Big Data, Scalable methods}
\terms{Algorithms, Experimentation, Performance}
\keywords{machine learning, classification trees, regression trees, sparse data, scalability}
\input{intro.tex}
\input{dense.tex}
\input{sparse.tex}
\input{experiment.tex}
\input{related.tex}
\section{Conclusion}
\label{sec:conclusion}
We proposed a method for building tree-based models with sparse input
support. Our method takes advantage of input sparsity by avoiding
sorting sample sets of a node along a feature unless they are nonzero
at that feature. This approach speeds up training substantially as
sorting is a costly but an essential and ubiquitous component of
tree-based models.
\section{Acknowledgment}
Arnaud Joly is a research fellow of the FNRS, Belgium. This work is
partially supported by PASCAL2 and the IUAP DYSCO, initiated by the
Belgian State, Science Policy Office.\\
\bibliographystyle{abbrv}
\bibliography{references}
%
%\bibliography{sigproc} % sigproc.bib is the name of the Bibliography in this case
% You must have a proper ".bib" file
% and remember to run:
% latex bibtex latex latex
% to resolve all references
%
% ACM needs 'a single self-contained file'!
%
%APPENDICES are optional
%\balancecolumns
\end{document}