lecture_note.bib

@book{carnie2013syntax,
  author    = {Carnie, Andrew},
  title     = {Syntax: A generative introduction},
  publisher = {John Wiley \& Sons},
  year      = {2013},
}

@article{chomsky1959review,
  author  = {Chomsky, Noam},
  title   = {A review of {B. F. Skinner's Verbal Behavior}},
  journal = {Language},
  volume  = {35},
  number  = {1},
  pages   = {26--58},
  year    = {1959},
}

@incollection{pesetsky1999linguistic,
  author    = {Pesetsky, David},
  title     = {Linguistic universals and universal grammar},
  booktitle = {The {MIT} Encyclopedia of the Cognitive Sciences},
  editor    = {Wilson, R. A. and Keil, F. C.},
  publisher = {The {MIT} Press},
  address   = {Cambridge, MA},
  year      = {1999},
}

@book{manning1999foundations,
  author    = {Manning, Christopher D and Sch{\"u}tze, Hinrich},
  title     = {Foundations of statistical natural language processing},
  publisher = {{MIT} Press},
  year      = {1999},
}

@article{winograd1972understanding,
  author    = {Winograd, Terry},
  title     = {Understanding natural language},
  journal   = {Cognitive Psychology},
  volume    = {3},
  number    = {1},
  pages     = {1--191},
  year      = {1972},
  publisher = {Elsevier},
}

@article{chomsky1968linguistic,
  author    = {Chomsky, Noam},
  title     = {Linguistic contributions to the study of mind (Future)},
  journal   = {Language and thinking},
  pages     = {323--364},
  publisher = {Penguin Middlesex},
  year      = {1968},
}

@book{skinner2014verbal,
  author    = {Skinner, Burrhus Frederic},
  title     = {Verbal behavior},
  publisher = {BF Skinner Foundation},
  year      = {2014},
}

@inproceedings{perfors2006poverty,
  author    = {Perfors, Amy and Tenenbaum, Josh and Regier, Terry},
  title     = {Poverty of the stimulus? A rational approach},
  booktitle = {Proceedings of the 28th Annual Conference of the Cognitive Science Society},
  year      = {2006},
}

@article{schmidhuber2015deep,
  author    = {Schmidhuber, J{\"u}rgen},
  title     = {Deep learning in neural networks: An overview},
  journal   = {Neural Networks},
  volume    = {61},
  pages     = {85--117},
  publisher = {Pergamon},
  year      = {2015},
}

@article{lecun2015deep,
  author    = {LeCun, Yann and Bengio, Yoshua and Hinton, Geoffrey},
  title     = {Deep learning},
  journal   = {Nature},
  volume    = {521},
  number    = {7553},
  pages     = {436--444},
  publisher = {Nature Publishing Group},
  year      = {2015},
}

@book{Vapnik1995,
  title     = {The Nature of Statistical Learning Theory},
  author    = {Vapnik, Vladimir},
  publisher = {Springer-Verlag New York, Inc.},
  address   = {New York, NY, USA},
  year      = {1995},
}

@book{Fletcher1987,
  author    = {Fletcher, R.},
  title     = {Practical Methods of Optimization},
  edition   = {Second},
  publisher = {Wiley-Interscience},
  address   = {New York, NY, USA},
  year      = {1987},
}

@article{Brochu2010,
  author  = {{Brochu}, E. and {Cora}, V. M. and {de Freitas}, N.},
  title   = {A Tutorial on {Bayesian} Optimization of Expensive Cost
             Functions, with Application to Active User Modeling and
             Hierarchical Reinforcement Learning},
  journal = {arXiv preprint arXiv:1012.2599 [cs.LG]},
  year    = 2010,
  month   = dec,
}

@article{Robbins1951,
  author  = {Robbins, Herbert and Monro, Sutton},
  title   = {A Stochastic Approximation Method},
  journal = {The Annals of Mathematical Statistics},
  volume  = {22},
  number  = {3},
  pages   = {400--407},
  year    = {1951},
}

@incollection{Bottou1998,
  author    = {Bottou, L{\'e}on},
  title     = {Online Algorithms and Stochastic Approximations},
  booktitle = {Online Learning and Neural Networks},
  editor    = {Saad, David},
  publisher = {Cambridge University Press},
  address   = {Cambridge, UK},
  year      = {1998},
}

@incollection{Lecun1998a,
  author    = {{LeCun}, Y. and Bottou, L. and Orr, G. and M{\"u}ller, K. R.},
  title     = {Efficient {BackProp}},
  booktitle = {Neural Networks: Tricks of the Trade},
  editor    = {Orr, G. and M{\"u}ller, K.},
  series    = {Lecture Notes in Computer Science},
  volume    = {1524},
  pages     = {5--50},
  publisher = {Springer Verlag},
  year      = {1998},
}


@inproceedings{lowe1999object,
  author       = {Lowe, David G},
  title        = {Object recognition from local scale-invariant features},
  booktitle    = {Proceedings of the Seventh {IEEE} International Conference on Computer Vision},
  volume       = {2},
  pages        = {1150--1157},
  year         = {1999},
  organization = {IEEE},
}

@article{Cover1965,
  author  = {Cover, T. M.},
  title   = {Geometrical and Statistical Properties of Systems of Linear Inequalities with Applications in Pattern Recognition},
  journal = {IEEE Transactions on Electronic Computers},
  volume  = {EC-14},
  number  = {3},
  pages   = {326--334},
  year    = {1965},
}

@incollection{Bridle1990,
  author    = {John S. Bridle},
  title     = {Training Stochastic Model Recognition Algorithms as Networks can
               Lead to Maximum Mutual Information Estimation of Parameters},
  booktitle = {Advances in Neural Information Processing Systems 2},
  editor    = {D. S. Touretzky},
  pages     = {211--217},
  publisher = {Morgan-Kaufmann},
  year      = {1990},
}


@inproceedings{denker1991transforming,
  author    = {Denker, John and LeCun, Yann},
  title     = {Transforming neural-net output levels to probability distributions},
  booktitle = {Advances in Neural Information Processing Systems 3},
  year      = {1991},
}

@techreport{bishop1994mixture,
  author      = {Bishop, Christopher M},
  title       = {Mixture density networks},
  institution = {Aston University},
  year        = {1994},
}


@book{kuhn2012structure,
  author    = {Kuhn, Thomas S},
  title     = {The structure of scientific revolutions},
  publisher = {University of Chicago Press},
  year      = {2012},
}

@article{Huang2006,
  author  = {Guang-Bin Huang and Qin-Yu Zhu and Chee-Kheong Siew},
  title   = {Extreme learning machine: Theory and applications},
  journal = {Neurocomputing},
  volume  = {70},
  number  = {1--3},
  pages   = {489--501},
  year    = {2006},
}

@book{Rosenblatt1962,
  author    = {Rosenblatt, F.},
  title     = {Principles of neurodynamics: perceptrons and the theory of brain mechanisms},
  series    = {Report (Cornell Aeronautical Laboratory)},
  publisher = {Spartan Books},
  year      = {1962},
}

@article{Rumelhart1986,
  author  = {Rumelhart, D. E. and Hinton, G. and Williams, R. J.},
  title   = {Learning representations by back-propagating errors},
  journal = {Nature},
  volume  = {323},
  number  = {6088},
  pages   = {533--536},
  year    = {1986},
  month   = oct,
}


@techreport{petersen2008matrix,
  author      = {Petersen, Kaare Brandt and Pedersen, Michael Syskind and others},
  title       = {The Matrix Cookbook},
  institution = {Technical University of Denmark},
  year        = {2008},
}

@inproceedings{bergstra2010theano,
  author    = {Bergstra, James and Breuleux, Olivier and Bastien, Fr{\'e}d{\'e}ric
               and Lamblin, Pascal and Pascanu, Razvan and Desjardins, Guillaume
               and Turian, Joseph and Warde-Farley, David and Bengio, Yoshua},
  title     = {{Theano}: a {CPU} and {GPU} math expression compiler},
  booktitle = {Proceedings of the {Python} for Scientific Computing Conference ({SciPy})},
  volume    = {4},
  pages     = {3},
  address   = {Austin, TX},
  year      = {2010},
}

@article{bastien2012theano,
  author  = {Bastien, Fr{\'e}d{\'e}ric and Lamblin, Pascal and Pascanu, Razvan
             and Bergstra, James and Goodfellow, Ian and Bergeron, Arnaud
             and Bouchard, Nicolas and Warde-Farley, David and Bengio, Yoshua},
  title   = {{Theano}: new features and speed improvements},
  journal = {arXiv preprint arXiv:1211.5590},
  year    = {2012},
}

@inproceedings{nair2010rectified,
  author    = {Nair, Vinod and Hinton, Geoffrey E},
  title     = {Rectified linear units improve restricted {Boltzmann} machines},
  booktitle = {Proceedings of the 27th International Conference on Machine Learning (ICML-10)},
  pages     = {807--814},
  year      = {2010},
}

@inproceedings{glorot2011deep,
  author    = {Glorot, Xavier and Bordes, Antoine and Bengio, Yoshua},
  title     = {Deep sparse rectifier neural networks},
  booktitle = {International Conference on Artificial Intelligence and Statistics},
  pages     = {315--323},
  year      = {2011},
}

@inproceedings{goodfellow2013maxout,
  author    = {Goodfellow, Ian and Warde-Farley, David and Mirza, Mehdi and Courville, Aaron and Bengio, Yoshua},
  title     = {Maxout Networks},
  booktitle = {Proceedings of the 30th International Conference on Machine Learning (ICML-13)},
  pages     = {1319--1327},
  year      = {2013},
}

@article{bengio2013estimating,
  author  = {Bengio, Yoshua and L{\'e}onard, Nicholas and Courville, Aaron},
  title   = {Estimating or propagating gradients through stochastic neurons for conditional computation},
  journal = {arXiv preprint arXiv:1308.3432},
  year    = {2013},
}

@article{he2015delving,
  author  = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title   = {Delving deep into rectifiers: Surpassing human-level performance on {ImageNet} classification},
  journal = {arXiv preprint arXiv:1502.01852},
  year    = {2015},
}

@article{le2015simple,
  author  = {Le, Quoc V and Jaitly, Navdeep and Hinton, Geoffrey E},
  title   = {A Simple Way to Initialize Recurrent Networks of Rectified Linear Units},
  journal = {arXiv preprint arXiv:1504.00941},
  year    = {2015},
}

@inproceedings{bengio2013advances,
  author       = {Bengio, Yoshua and Boulanger-Lewandowski, Nicolas and Pascanu, Razvan},
  title        = {Advances in optimizing recurrent networks},
  booktitle    = {2013 {IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})},
  pages        = {8624--8628},
  year         = {2013},
  organization = {IEEE},
}

@inproceedings{Cho-et-al-EMNLP2014,
  author    = {Cho, Kyunghyun and van Merri{\"e}nboer, Bart and Gulcehre, Caglar
               and Bahdanau, Dzmitry and Bougares, Fethi and Schwenk, Holger
               and Bengio, Yoshua},
  title     = {Learning Phrase Representations using {RNN} Encoder-Decoder for
               Statistical Machine Translation},
  booktitle = {Proceedings of the Conference on Empirical Methods in Natural Language
               Processing ({EMNLP} 2014)},
  year      = 2014,
  month     = oct,
}


@article{hochreiter1997long,
  author    = {Hochreiter, Sepp and Schmidhuber, J{\"u}rgen},
  title     = {Long short-term memory},
  journal   = {Neural Computation},
  volume    = {9},
  number    = {8},
  pages     = {1735--1780},
  year      = {1997},
  publisher = {MIT Press},
}

@phdthesis{gers2001long,
  author  = {Gers, Felix},
  title   = {Long short-term memory in recurrent neural networks},
  school  = {{\'E}cole Polytechnique F{\'e}d{\'e}rale de Lausanne},
  address = {Lausanne, Switzerland},
  year    = {2001},
}

@article{gers2000learning,
  author    = {Gers, Felix A and Schmidhuber, J{\"u}rgen and Cummins, Fred},
  title     = {Learning to forget: Continual prediction with {LSTM}},
  journal   = {Neural Computation},
  volume    = {12},
  number    = {10},
  pages     = {2451--2471},
  year      = {2000},
  publisher = {MIT Press},
}

@article{greff2015lstm,
  author  = {Greff, Klaus and Srivastava, Rupesh Kumar and Koutn{\'\i}k, Jan
             and Steunebrink, Bas R and Schmidhuber, J{\"u}rgen},
  title   = {{LSTM}: A Search Space Odyssey},
  journal = {arXiv preprint arXiv:1503.04069},
  year    = {2015},
}

@inproceedings{jozefowicz2015empirical,
  author    = {Jozefowicz, Rafal and Zaremba, Wojciech and Sutskever, Ilya},
  title     = {An Empirical Exploration of Recurrent Network Architectures},
  booktitle = {Proceedings of the 32nd International Conference on Machine Learning (ICML-15)},
  pages     = {2342--2350},
  year      = {2015},
}


@inproceedings{krizhevsky2012imagenet,
  author    = {Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E},
  title     = {{ImageNet} classification with deep convolutional neural networks},
  booktitle = {Advances in Neural Information Processing Systems},
  pages     = {1097--1105},
  year      = {2012},
}


@article{bengio1994learning,
  author    = {Bengio, Yoshua and Simard, Patrice and Frasconi, Paolo},
  title     = {Learning long-term dependencies with gradient descent is difficult},
  journal   = {{IEEE} Transactions on Neural Networks},
  volume    = {5},
  number    = {2},
  pages     = {157--166},
  year      = {1994},
  publisher = {IEEE},
}

@incollection{hochreiter2001gradient,
  author    = {Hochreiter, Sepp and Bengio, Yoshua and Frasconi, Paolo and Schmidhuber, J{\"u}rgen},
  title     = {Gradient flow in recurrent nets: the difficulty of learning long-term dependencies},
  booktitle = {A Field Guide to Dynamical Recurrent Neural Networks},
  volume    = {1},
  publisher = {{IEEE} Press},
  year      = {2001},
}

@inproceedings{pascanu2013difficulty,
  author    = {Pascanu, Razvan and Mikolov, Tomas and Bengio, Yoshua},
  title     = {On the difficulty of training recurrent neural networks},
  booktitle = {Proceedings of The 30th International Conference on Machine Learning},
  pages     = {1310--1318},
  year      = {2013},
}

@book{chomsky2002syntactic,
  author    = {Chomsky, Noam},
  title     = {Syntactic structures},
  publisher = {Walter de Gruyter},
  year      = {2002},
}

@inproceedings{chen1996empirical,
  author       = {Chen, Stanley F and Goodman, Joshua},
  title        = {An empirical study of smoothing techniques for language modeling},
  booktitle    = {Proceedings of the 34th annual meeting on Association for Computational Linguistics},
  pages        = {310--318},
  organization = {Association for Computational Linguistics},
  year         = {1996},
}

@article{witten1991zero,
  author    = {Witten, Ian H and Bell, Timothy C},
  title     = {The zero-frequency problem: Estimating the probabilities of novel events in adaptive text compression},
  journal   = {{IEEE} Transactions on Information Theory},
  volume    = {37},
  number    = {4},
  pages     = {1085--1094},
  year      = {1991},
  publisher = {IEEE},
}

@inproceedings{kneser1995improved,
  author       = {Kneser, Reinhard and Ney, Hermann},
  title        = {Improved backing-off for m-gram language modeling},
  booktitle    = {1995 International Conference on Acoustics, Speech, and Signal Processing ({ICASSP}-95)},
  volume       = {1},
  pages        = {181--184},
  year         = {1995},
  organization = {IEEE},
}

@inproceedings{Heafield-estimate,
  author    = {Kenneth Heafield and Ivan Pouzyrevsky and Jonathan H. Clark and Philipp Koehn},
  title     = {Scalable Modified {Kneser-Ney} Language Model Estimation},
  booktitle = {Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics},
  address   = {Sofia, Bulgaria},
  pages     = {690--696},
  year      = {2013},
  month     = aug,
}

@article{kim2015character,
  author  = {Kim, Yoon and Jernite, Yacine and Sontag, David and Rush, Alexander M},
  title   = {Character-Aware Neural Language Models},
  journal = {arXiv preprint arXiv:1508.06615},
  year    = {2015},
}

@article{sennrich2015neural,
  author  = {Sennrich, Rico and Haddow, Barry and Birch, Alexandra},
  title   = {Neural Machine Translation of Rare Words with Subword Units},
  journal = {arXiv preprint arXiv:1508.07909},
  year    = {2015},
}

@inproceedings{jernite2015fast,
  author    = {Jernite, Yacine and Rush, Alexander M and Sontag, David},
  title     = {A Fast Variational Approach for Learning {Markov} Random Field Language Models},
  booktitle = {Proceedings of the 32nd International Conference on Machine Learning (ICML-15)},
  year      = {2015},
}


@incollection{bengio2006neural,
  author    = {Bengio, Yoshua and Schwenk, Holger and Sen{\'e}cal, Jean-S{\'e}bastien
               and Morin, Fr{\'e}d{\'e}ric and Gauvain, Jean-Luc},
  title     = {Neural probabilistic language models},
  booktitle = {Innovations in Machine Learning},
  pages     = {137--186},
  publisher = {Springer Berlin Heidelberg},
  year      = {2006},
}

@incollection{firth1957,
  author    = {Firth, J. R.},
  title     = {A synopsis of linguistic theory 1930--1955},
  booktitle = {Studies in Linguistic Analysis},
  publisher = {Philological Society},
  address   = {Oxford},
  year      = {1957},
}


@inproceedings{turian2010word,
  author       = {Turian, Joseph and Ratinov, Lev and Bengio, Yoshua},
  title        = {Word representations: a simple and general method for semi-supervised learning},
  booktitle    = {Proceedings of the 48th annual meeting of the association for computational linguistics},
  pages        = {384--394},
  organization = {Association for Computational Linguistics},
  year         = {2010},
}

@article{VanDerMaaten08,
  author  = {Laurens {van der Maaten} and Geoffrey E. Hinton},
  title   = {Visualizing Data using {t-SNE}},
  journal = {Journal of Machine Learning Research},
  volume  = {9},
  pages   = {2579--2605},
  year    = {2008},
  month   = nov,
}


@article{besag1975statistical,
  author    = {Besag, Julian},
  title     = {Statistical analysis of non-lattice data},
  journal   = {The Statistician},
  pages     = {179--195},
  year      = {1975},
  publisher = {JSTOR},
}

@article{mikolov2013efficient,
  author  = {Mikolov, Tomas and Chen, Kai and Corrado, Greg and Dean, Jeffrey},
  title   = {Efficient estimation of word representations in vector space},
  journal = {arXiv preprint arXiv:1301.3781},
  year    = {2013},
}


@incollection{Omer2014,
  author    = {Levy, Omer and Goldberg, Yoav},
  title     = {Neural Word Embedding as Implicit Matrix Factorization},
  booktitle = {Advances in Neural Information Processing Systems 27},
  editor    = {Z. Ghahramani and M. Welling and C. Cortes and N. D. Lawrence and K. Q. Weinberger},
  pages     = {2177--2185},
  publisher = {Curran Associates, Inc.},
  year      = {2014},
}


@article{collobert2011natural,
  author    = {Collobert, Ronan and Weston, Jason and Bottou, L{\'e}on
               and Karlen, Michael and Kavukcuoglu, Koray and Kuksa, Pavel},
  title     = {Natural language processing (almost) from scratch},
  journal   = {Journal of Machine Learning Research},
  volume    = {12},
  pages     = {2493--2537},
  year      = {2011},
  publisher = {JMLR. org},
}

@book{Chapelle2006,
  title     = {Semi-Supervised Learning},
  editor    = {O. Chapelle and B. Sch{\"o}lkopf and A. Zien},
  publisher = {MIT Press},
  address   = {Cambridge, MA},
  year      = 2006,
}

@inproceedings{mikolov2010recurrent,
  author    = {Mikolov, Tomas and Karafi{\'a}t, Martin and Burget, Lukas
               and {\v{C}}ernock{\'y}, Jan and Khudanpur, Sanjeev},
  title     = {Recurrent neural network based language model},
  booktitle = {INTERSPEECH 2010},
  pages     = {1045--1048},
  year      = {2010},
}


@article{sundermeyer2015feedforward,
  author    = {Sundermeyer, Martin and Ney, Hermann and Schl{\"u}ter, Ralf},
  title     = {From Feedforward to Recurrent {LSTM} Neural Networks for Language Modeling},
  journal   = {{IEEE/ACM} Transactions on Audio, Speech, and Language Processing},
  volume    = {23},
  number    = {3},
  pages     = {517--529},
  year      = {2015},
  publisher = {IEEE},
}

@article{baltescu2014pragmatic,
  author  = {Baltescu, Paul and Blunsom, Phil},
  title   = {Pragmatic Neural Language Modelling in Machine Translation},
  journal = {arXiv preprint arXiv:1412.7119},
  year    = {2014},
}

@article{schwenk2007continuous,
  author    = {Schwenk, Holger},
  title     = {Continuous space language models},
  journal   = {Computer Speech \& Language},
  volume    = {21},
  number    = {3},
  pages     = {492--518},
  publisher = {Elsevier},
  year      = {2007},
}

@article{baydin2015automatic,
  author  = {Baydin, Atilim Gunes and Pearlmutter, Barak A and Radul, Alexey Andreyevich},
  title   = {Automatic differentiation in machine learning: a survey},
  journal = {arXiv preprint arXiv:1502.05767},
  year    = {2015},
}

@book{post2011,
  author    = {Chad W. Post},
  title     = {The Three Percent Problem: Rants and Responses on Publishing, Translation, and the Future of Reading},
  publisher = {Open Letter},
  year      = 2011,
}

@article{brown1990statistical,
  author    = {Brown, Peter F and Cocke, John and Della Pietra, Stephen A
               and Della Pietra, Vincent J and Jelinek, Frederick
               and Lafferty, John D and Mercer, Robert L and Roossin, Paul S},
  title     = {A statistical approach to machine translation},
  journal   = {Computational Linguistics},
  volume    = {16},
  number    = {2},
  pages     = {79--85},
  year      = {1990},
  publisher = {MIT Press},
}

@inproceedings{koehn2005europarl,
  author    = {Koehn, Philipp},
  title     = {{Europarl}: A parallel corpus for statistical machine translation},
  booktitle = {{MT} Summit},
  volume    = {5},
  pages     = {79--86},
  year      = {2005},
}

@inproceedings{cettoloEtAl:EAMT2012,
  author    = {Mauro Cettolo and Christian Girardi and Marcello Federico},
  title     = {{WIT$^3$}: Web Inventory of Transcribed and Translated Talks},
  booktitle = {Proceedings of the 16th Conference of the European Association for Machine Translation ({EAMT})},
  address   = {Trento, Italy},
  pages     = {261--268},
  year      = {2012},
  month     = may,
  eventdate = {2012-05-28/2012-05-30},
}

@article{resnik2003web,
  author    = {Resnik, Philip and Smith, Noah A},
  title     = {The web as a parallel corpus},
  journal   = {Computational Linguistics},
  volume    = {29},
  number    = {3},
  pages     = {349--380},
  publisher = {MIT Press},
  year      = {2003},
}

@incollection{zhang2006automatic,
  author    = {Zhang, Ying and Wu, Ke and Gao, Jianfeng and Vines, Phil},
  title     = {Automatic acquisition of {Chinese--English} parallel corpus from the web},
  booktitle = {Advances in Information Retrieval},
  pages     = {420--431},
  year      = {2006},
  publisher = {Springer},
}

@inproceedings{smith2013dirt,
  author    = {Smith, Jason R and Saint-Amand, Herv{\'e} and Plamada, Magdalena
               and Koehn, Philipp and Callison-Burch, Chris and Lopez, Adam},
  title     = {Dirt Cheap Web-Scale Parallel Text from the {Common Crawl}},
  booktitle = {Proceedings of the 51st Annual Meeting of the Association for
               Computational Linguistics (Volume 1: Long Papers)},
  pages     = {1374--1383},
  year      = {2013},
}

@inproceedings{papineni2002bleu,
  author       = {Papineni, Kishore and Roukos, Salim and Ward, Todd and Zhu, Wei-Jing},
  title        = {{BLEU}: a method for automatic evaluation of machine translation},
  booktitle    = {Proceedings of the 40th annual meeting on association for computational linguistics},
  pages        = {311--318},
  year         = {2002},
  organization = {Association for Computational Linguistics},
}

@inproceedings{callison2006re,
  author    = {Callison-Burch, Chris and Osborne, Miles and Koehn, Philipp},
  title     = {Re-evaluating the Role of {Bleu} in Machine Translation Research},
  booktitle = {EACL},
  volume    = {6},
  pages     = {249--256},
  year      = {2006},
}

@inproceedings{denkowski:lavie:meteor-wmt:2014,
  author    = {Michael Denkowski and Alon Lavie},
  title     = {{Meteor Universal}: Language Specific Translation Evaluation for Any Target Language},
  booktitle = {Proceedings of the Ninth Workshop on Statistical Machine Translation},
  year      = {2014},
}

@inproceedings{snover2006study,
  author    = {Snover, Matthew and Dorr, Bonnie and Schwartz, Richard and Micciulla, Linnea and Makhoul, John},
  title     = {A study of translation edit rate with targeted human annotation},
  booktitle = {Proceedings of association for machine translation in the Americas},
  pages     = {223--231},
  year      = {2006},
}

@inproceedings{sutskever2014sequence,
  author    = {Sutskever, Ilya and Vinyals, Oriol and Le, Quoc V},
  title     = {Sequence to sequence learning with neural networks},
  booktitle = {Advances in Neural Information Processing Systems},
  pages     = {3104--3112},
  year      = {2014},
}

@incollection{forcada1997recursive,
  author    = {Forcada, Mikel L and {\~N}eco, Ram{\'o}n P},
  title     = {Recursive hetero-associative memories for translation},
  booktitle = {Biological and Artificial Computation: From Neuroscience to Technology},
  pages     = {453--462},
  publisher = {Springer},
  year      = {1997},
}

@article{cho2014learning,
  author  = {Cho, Kyunghyun and van Merri{\"e}nboer, Bart and Gulcehre, Caglar
             and Bahdanau, Dzmitry and Bougares, Fethi and Schwenk, Holger
             and Bengio, Yoshua},
  title   = {Learning phrase representations using {RNN} encoder-decoder for
             statistical machine translation},
  journal = {arXiv preprint arXiv:1406.1078},
  year    = {2014},
}


@inproceedings{furcy2005limited,
  author    = {Furcy, David and Koenig, Sven},
  title     = {Limited discrepancy beam search},
  booktitle = {IJCAI},
  pages     = {125--131},
  year      = {2005},
}

@inproceedings{zhou2005beam,
  author    = {Zhou, Rong and Hansen, Eric A},
  title     = {Beam-Stack Search: Integrating Backtracking with Beam Search},
  booktitle = {ICAPS},
  pages     = {90--98},
  year      = {2005},
}

@incollection{koehn2004pharaoh,
  author    = {Koehn, Philipp},
  title     = {{Pharaoh}: a beam search decoder for phrase-based statistical machine translation models},
  booktitle = {Machine translation: From real users to research},
  pages     = {115--124},
  year      = {2004},
  publisher = {Springer},
}

@book{koehn2009statistical,
  author    = {Koehn, Philipp},
  title     = {Statistical machine translation},
  publisher = {Cambridge University Press},
  year      = {2009},
}

@book{russell1995artificial,
  author    = {Russell, Stuart and Norvig, Peter},
  title     = {Artificial intelligence: a modern approach},
  publisher = {Prentice Hall},
  year      = {1995},
}

@article{cho2014properties,
  author  = {Cho, Kyunghyun and van Merri{\"e}nboer, Bart and Bahdanau, Dzmitry and Bengio, Yoshua},
  title   = {On the properties of neural machine translation: Encoder-decoder approaches},
  journal = {arXiv preprint arXiv:1409.1259},
  year    = {2014},
}

@article{cho2015describing,
  author    = {Cho, Kyunghyun and Courville, Aaron and Bengio, Yoshua},
  title     = {Describing Multimedia Content using Attention-based Encoder--Decoder Networks},
  journal   = {{IEEE} Transactions on Multimedia},
  volume    = {17},
  number    = {11},
  pages     = {1875--1886},
  year      = {2015},
  publisher = {IEEE},
}
@article{bahdanau2014neural,
  author  = {Bahdanau, Dzmitry and Cho, Kyunghyun and Bengio, Yoshua},
  title   = {Neural machine translation by jointly learning to align and translate},
  journal = {arXiv preprint arXiv:1409.0473},
  year    = {2014},
}


@article{luong2015effective,
  author  = {Luong, Minh-Thang and Pham, Hieu and Manning, Christopher D},
  title   = {Effective Approaches to Attention-based Neural Machine Translation},
  journal = {arXiv preprint arXiv:1508.04025},
  year    = {2015},
}

@article{breiman2001statistical,
  author    = {Breiman, Leo},
  title     = {Statistical modeling: The two cultures (with comments and a rejoinder by the author)},
  journal   = {Statistical Science},
  volume    = {16},
  number    = {3},
  pages     = {199--231},
  year      = {2001},
  publisher = {Institute of Mathematical Statistics},
}

@inproceedings{jean2015montreal,
  author    = {Jean, S{\'e}bastien and Firat, Orhan and Cho, Kyunghyun and Memisevic, Roland and Bengio, Yoshua},
  title     = {{Montreal} Neural Machine Translation Systems for {WMT15}},
  booktitle = {Proceedings of the Tenth Workshop on Statistical Machine Translation},
  pages     = {134--140},
  year      = {2015},
}


@inproceedings{jean2014using,
  author    = {Jean, S{\'e}bastien and Cho, Kyunghyun and Memisevic, Roland and Bengio, Yoshua},
  title     = {On Using Very Large Target Vocabulary for Neural Machine Translation},
  booktitle = {Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics ({ACL} 2015)},
  year      = {2015},
}

@inproceedings{Durrani2014,
  author    = {Durrani, Nadir and Haddow, Barry and Koehn, Philipp and Heafield, Kenneth},
  title     = {{Edinburgh's} phrase-based machine translation systems for {WMT-14}},
  booktitle = {Proceedings of the Ninth Workshop on Statistical Machine Translation},
  pages     = {97--104},
  year      = {2014},
  publisher = {Association for Computational Linguistics},
  address   = {Baltimore, MD, USA},
}

@incollection{weaver1955translation,
  author    = {Weaver, Warren},
  title     = {Translation},
  booktitle = {Machine Translation of Languages},
  pages     = {15--23},
  publisher = {Technology Press, {MIT}},
  address   = {Cambridge},
  year      = {1955},
}

@article{ling2015character,
  author  = {Ling, Wang and Trancoso, Isabel and Dyer, Chris and Black, Alan W},
  title   = {Character-based Neural Machine Translation},
  journal = {arXiv preprint arXiv:1511.04586},
  year    = {2015},
}


@inproceedings{koehn2003statistical,
  author       = {Koehn, Philipp and Och, Franz Josef and Marcu, Daniel},
  title        = {Statistical phrase-based translation},
  booktitle    = {Proceedings of the 2003 Conference of the North American Chapter of the Association for Computational Linguistics on Human Language Technology-Volume 1},
  pages        = {48--54},
  organization = {Association for Computational Linguistics},
  year         = {2003},
}


@inproceedings{dong2015multi,
  author       = {Dong, Daxiang and Wu, Hua and He, Wei and Yu, Dianhai and Wang, Haifeng},
  title        = {Multi-task learning for multiple language translation},
  booktitle    = {Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing},
  year         = {2015},
  organization = {ACL},
}


@article{luong2015,
  author  = {Minh-Thang Luong and Quoc V. Le and Ilya Sutskever and Oriol Vinyals and Lukasz Kaiser},
  title   = {Multi-task Sequence to Sequence Learning},
  journal = {arXiv preprint ar{X}iv:1511.06114},
  year    = {2015},
}

@article{sermanet2013overfeat,
  author  = {Sermanet, Pierre and Eigen, David and Zhang, Xiang and Mathieu, Micha{\"e}l
             and Fergus, Rob and LeCun, Yann},
  title   = {{OverFeat}: Integrated recognition, localization and detection using
             convolutional networks},
  journal = {arXiv preprint arXiv:1312.6229},
  year    = {2013},
}

@inproceedings{Kiros-et-al-ICML2014,
  author    = {Ryan Kiros and Ruslan Salakhutdinov and Richard Zemel},
  title     = {Multimodal Neural Language Models},
  booktitle = {ICML'2014},
  year      = 2014,
}

@misc{Karpathy+Li-arxiv2014,
  author       = {Andrej Karpathy and Fei-Fei Li},
  title        = {Deep Visual-Semantic Alignments for Generating Image Descriptions},
  howpublished = {arXiv:1412.2306},
  year         = 2014,
}

@misc{Mao+al-arxiv2014,
  author       = {Junhua Mao and Wei Xu and Yi Yang and Jiang Wang and Alan L. Yuille},
  title        = {Explain Images with Multimodal Recurrent Neural Networks},
  howpublished = {arXiv:1410.1090},
  year         = 2014,
}

@misc{Donahue-et-al-arxiv2014,
  author       = {Jeff Donahue and Lisa Anne Hendricks and Sergio Guadarrama and Marcus Rohrbach and Subhashini Venugopalan and Kate Saenko and Trevor Darrell},
  title        = {Long-term Recurrent Convolutional Networks for Visual Recognition and Description},
  howpublished = {arXiv:1411.4389},
  year         = 2014,
}


@misc{Fang-et-al-arxiv2014,
  author       = {Hao Fang and Saurabh Gupta and Forrest Iandola and Rupesh Srivastava
                  and Li Deng and Piotr Doll{\'a}r and Jianfeng Gao and Xiaodong He
                  and Margaret Mitchell and John C. Platt and C. Lawrence Zitnick
                  and Geoffrey Zweig},
  title        = {From captions to visual concepts and back},
  howpublished = {arXiv:1411.4952},
  year         = 2014,
}


@misc{Chen+Zitnick-arxiv2014,
  author       = {Xinlei Chen and C. Lawrence Zitnick},
  title        = {Learning a Recurrent Visual Representation for Image Caption Generation},
  howpublished = {arXiv:1411.5654},
  year         = 2014,
}

@article{simonyan2014very,
    author  = {Simonyan, Karen and Zisserman, Andrew},
    title   = {Very deep convolutional networks for large-scale image
               recognition},
    journal = {arXiv preprint arXiv:1409.1556},
    year    = {2014}
}

@article{szegedy2014going,
    author  = {Szegedy, Christian and Liu, Wei and Jia, Yangqing and
               Sermanet, Pierre and Reed, Scott and Anguelov, Dragomir and
               Erhan, Dumitru and Vanhoucke, Vincent and Rabinovich, Andrew},
    title   = {Going deeper with convolutions},
    journal = {arXiv preprint arXiv:1409.4842},
    year    = {2014}
}

@inproceedings{xu2015show,
    author    = {Xu, Kelvin and Ba, Jimmy and Kiros, Ryan and Cho, Kyunghyun and
                 Courville, Aaron and Salakhutdinov, Ruslan and
                 Zemel, Richard and Bengio, Yoshua},
    title     = {Show, Attend and Tell: Neural Image Caption Generation with
                 Visual Attention},
    booktitle = {International Conference on Machine Learning},
    year      = {2015}
}

@article{vinyals2014show,
    author  = {Vinyals, Oriol and Toshev, Alexander and Bengio, Samy and
               Erhan, Dumitru},
    title   = {Show and tell: A neural image caption generator},
    journal = {arXiv preprint arXiv:1411.4555},
    year    = {2014}
}

% EMNLP 2014 conference paper (proceedings title goes in booktitle, no volume).
@inproceedings{kiela2014learning,
    author    = {Kiela, Douwe and Bottou, L{\'e}on},
    title     = {Learning image embeddings using convolutional neural networks
                 for improved multi-modal semantics},
    booktitle = {Proceedings of the 2014 Conference on Empirical Methods in
                 Natural Language Processing ({EMNLP})},
    year      = {2014}
}


@article{weston2010large,
    author    = {Weston, Jason and Bengio, Samy and Usunier, Nicolas},
    title     = {Large scale image annotation: learning to rank with joint
                 word-image embeddings},
    journal   = {Machine Learning},
    volume    = {81},
    number    = {1},
    pages     = {21--35},
    year      = {2010},
    publisher = {Springer US}
}

@inproceedings{VQA,
    author    = {Antol, Stanislaw and Agrawal, Aishwarya and Lu, Jiasen and
                 Mitchell, Margaret and Batra, Dhruv and
                 Zitnick, C. Lawrence and Parikh, Devi},
    title     = {{VQA}: Visual Question Answering},
    booktitle = {International Conference on Computer Vision (ICCV)},
    year      = {2015}
}


@article{bordes2015large,
    author  = {Bordes, Antoine and Usunier, Nicolas and Chopra, Sumit and
               Weston, Jason},
    title   = {Large-scale simple question answering with memory networks},
    journal = {arXiv preprint arXiv:1506.02075},
    year    = {2015}
}


@article{hermann2015teaching,
    author  = {Hermann, Karl Moritz and Ko{\v{c}}isk{\'y}, Tom{\'a}{\v{s}} and
               Grefenstette, Edward and Espeholt, Lasse and Kay, Will and
               Suleyman, Mustafa and Blunsom, Phil},
    title   = {Teaching machines to read and comprehend},
    journal = {arXiv preprint arXiv:1506.03340},
    year    = {2015}
}

@article{hill2015goldilocks,
    author  = {Hill, Felix and Bordes, Antoine and Chopra, Sumit and
               Weston, Jason},
    title   = {The {Goldilocks} Principle: Reading Children's Books with
               Explicit Memory Representations},
    journal = {arXiv preprint arXiv:1511.02301},
    year    = {2015}
}


@article{wang2015larger,
    author  = {Wang, Tian and Cho, Kyunghyun},
    title   = {Larger-Context Language Modelling},
    journal = {arXiv preprint arXiv:1511.03729},
    year    = {2015}
}

@article{ji2015document,
    author  = {Ji, Yangfeng and Cohn, Trevor and Kong, Lingpeng and
               Dyer, Chris and Eisenstein, Jacob},
    title   = {Document Context Language Models},
    journal = {arXiv preprint arXiv:1511.03962},
    year    = {2015}
}

% One-page editorial: IRE Trans. Inf. Theory, vol. 2, no. 1 (March 1956), p. 3.
@article{shannon1956bandwagon,
    author  = {Shannon, Claude E.},
    title   = {The Bandwagon},
    journal = {IRE Transactions on Information Theory},
    volume  = {2},
    number  = {1},
    pages   = {3},
    year    = {1956},
    note    = {Editorial}
}

@article{goldberg2015primer,
    author  = {Goldberg, Yoav},
    title   = {A Primer on Neural Network Models for Natural Language
               Processing},
    journal = {arXiv preprint arXiv:1510.00726},
    year    = {2015}
}


@book{bishop2006pattern,
    author    = {Bishop, Christopher M.},
    title     = {Pattern Recognition and Machine Learning},
    series    = {Information Science and Statistics},
    publisher = {Springer-Verlag New York, Inc.},
    address   = {Secaucus, NJ, USA},
    year      = {2006}
}


@book{murphy2012machine,
    author    = {Murphy, Kevin P.},
    title     = {Machine Learning: A Probabilistic Perspective},
    publisher = {MIT Press},
    year      = {2012}
}

@article{cortes1995support,
    author    = {Cortes, Corinna and Vapnik, Vladimir},
    title     = {Support-vector networks},
    journal   = {Machine Learning},
    volume    = {20},
    number    = {3},
    pages     = {273--297},
    year      = {1995},
    publisher = {Springer}
}

@article{goodman2010ensemble,
    author    = {Goodman, Jonathan and Weare, Jonathan},
    title     = {Ensemble samplers with affine invariance},
    journal   = {Communications in Applied Mathematics and Computational
                 Science},
    volume    = {5},
    number    = {1},
    pages     = {65--80},
    year      = {2010},
    publisher = {Mathematical Sciences Publishers}
}

@book{goodfellow2016deep,
    author    = {Goodfellow, Ian and Bengio, Yoshua and Courville, Aaron},
    title     = {Deep Learning},
    publisher = {MIT Press},
    year      = {2016}
}

@inproceedings{lee2001algorithms,
    author    = {Lee, Daniel D. and Seung, H. Sebastian},
    title     = {Algorithms for non-negative matrix factorization},
    booktitle = {Advances in Neural Information Processing Systems},
    pages     = {556--562},
    year      = {2001}
}

@article{olshausen1997sparse,
    author    = {Olshausen, Bruno A. and Field, David J.},
    title     = {Sparse coding with an overcomplete basis set: A strategy
                 employed by {V1}?},
    journal   = {Vision Research},
    volume    = {37},
    number    = {23},
    pages     = {3311--3325},
    year      = {1997},
    publisher = {Elsevier}
}

@book{hyvarinen2004independent,
    author    = {Hyv{\"a}rinen, Aapo and Karhunen, Juha and Oja, Erkki},
    title     = {Independent Component Analysis},
    volume    = {46},
    publisher = {John Wiley \& Sons},
    year      = {2004}
}

@article{hinton2006reducing,
    author    = {Hinton, Geoffrey E. and Salakhutdinov, Ruslan R.},
    title     = {Reducing the dimensionality of data with neural networks},
    journal   = {Science},
    volume    = {313},
    number    = {5786},
    pages     = {504--507},
    year      = {2006},
    publisher = {American Association for the Advancement of Science}
}

@inproceedings{cho2013simple,
    author    = {Cho, Kyunghyun},
    title     = {Simple sparsification improves sparse denoising autoencoders
                 in denoising highly corrupted images},
    booktitle = {Proceedings of the 30th International Conference on Machine
                 Learning ({ICML}-13)},
    pages     = {432--440},
    year      = {2013}
}

@article{salakhutdinov2009semantic,
    author    = {Salakhutdinov, Ruslan and Hinton, Geoffrey},
    title     = {Semantic hashing},
    journal   = {International Journal of Approximate Reasoning},
    volume    = {50},
    number    = {7},
    pages     = {969--978},
    year      = {2009},
    publisher = {Elsevier}
}

@article{bojarski2016end,
    author  = {Bojarski, Mariusz and Del Testa, Davide and
               Dworakowski, Daniel and Firner, Bernhard and Flepp, Beat and
               Goyal, Prasoon and Jackel, Lawrence D. and Monfort, Mathew and
               Muller, Urs and Zhang, Jiakai and others},
    title   = {End to end learning for self-driving cars},
    journal = {arXiv preprint arXiv:1604.07316},
    year    = {2016}
}


% AISTATS 2011 conference paper; page range is within the proceedings volume.
@inproceedings{ross2011reduction,
    author    = {Ross, St{\'e}phane and Gordon, Geoffrey J. and Bagnell, Drew},
    title     = {A Reduction of Imitation Learning and Structured Prediction
                 to No-Regret Online Learning},
    booktitle = {Proceedings of the Fourteenth International Conference on
                 Artificial Intelligence and Statistics ({AISTATS})},
    pages     = {627--635},
    year      = {2011}
}

% NOTE(review): recorded as the 1996 Athena Scientific monograph. Confirm that
% the 1995 IEEE CDC overview paper of the same name was not the intended source.
@book{bertsekas1996neuro,
    author    = {Bertsekas, Dimitri P. and Tsitsiklis, John N.},
    title     = {Neuro-Dynamic Programming},
    publisher = {Athena Scientific},
    address   = {Belmont, MA},
    year      = {1996}
}

@book{scholkopf2002learning,
    author    = {Sch{\"o}lkopf, Bernhard and Smola, Alexander J.},
    title     = {Learning with Kernels: Support Vector Machines,
                 Regularization, Optimization, and Beyond},
    publisher = {MIT Press},
    year      = {2002}
}

@article{breiman2001random,
    author    = {Breiman, Leo},
    title     = {Random forests},
    journal   = {Machine Learning},
    volume    = {45},
    number    = {1},
    pages     = {5--32},
    year      = {2001},
    publisher = {Springer}
}

% Monograph published by MIT Press (a book, not a journal article).
@book{williams2006gaussian,
    author    = {Rasmussen, Carl Edward and Williams, Christopher K. I.},
    title     = {Gaussian Processes for Machine Learning},
    publisher = {MIT Press},
    year      = {2006}
}

@article{tipping1999probabilistic,
    author    = {Tipping, Michael E. and Bishop, Christopher M.},
    title     = {Probabilistic principal component analysis},
    journal   = {Journal of the Royal Statistical Society: Series B
                 (Statistical Methodology)},
    volume    = {61},
    number    = {3},
    pages     = {611--622},
    year      = {1999},
    publisher = {Wiley Online Library}
}

% NIPS conference paper. NOTE(review): publisher carried over from the
% exported record with only its letter case normalised; verify if it matters.
@inproceedings{roweis1998algorithms,
    author    = {Roweis, Sam},
    title     = {{EM} algorithms for {PCA} and {SPCA}},
    booktitle = {Advances in Neural Information Processing Systems},
    pages     = {626--632},
    year      = {1998},
    publisher = {Morgan Kaufmann Publishers}
}

@article{ilin2010practical,
    author  = {Ilin, Alexander and Raiko, Tapani},
    title   = {Practical approaches to principal component analysis in the
               presence of missing values},
    journal = {Journal of Machine Learning Research},
    volume  = {11},
    number  = {Jul},
    pages   = {1957--2000},
    year    = {2010}
}

% Presented at NIPS 2007; printed in volume 20 of the proceedings.
@inproceedings{salakhutdinov2007probabilistic,
    author    = {Salakhutdinov, Ruslan and Mnih, Andriy},
    title     = {Probabilistic matrix factorization},
    booktitle = {Advances in Neural Information Processing Systems},
    volume    = {20},
    pages     = {1257--1264},
    year      = {2007}
}

@inproceedings{lawrence2004gaussian,
    author    = {Lawrence, Neil D.},
    title     = {Gaussian process latent variable models for visualisation of
                 high dimensional data},
    booktitle = {Advances in Neural Information Processing Systems},
    pages     = {329--336},
    year      = {2004}
}