% Web page by Peter Norvig, referenced by URL only.
% {Martin} and {iPod} are braced so sentence-casing styles keep their capitals.
@misc{ipodnorvig,
    author       = {Norvig, Peter},
    title        = {Doing the {Martin} Shuffle (with your {iPod})},
    howpublished = {\url{http://norvig.com/ipod.html}},
}

% Page from the HTML edition of a JMLR volume 3 paper (id szita02a in the URL).
% NOTE(review): author and year were filled in from that paper's published
% metadata, not from this file -- confirm before relying on them. Without an
% author (or a key field) standard styles cannot sort or label this entry.
@misc{viconvergence,
    author       = {Szita, Istv{\'a}n and Tak{\'a}cs, B{\'a}lint and L{\H{o}}rincz, Andr{\'a}s},
    title        = {The Convergence of a General Value Iteration Process},
    howpublished = {\url{http://jmlr.csail.mit.edu/papers/volume3/szita02a/html/node21.html}},
    year         = {2002},
}

% Web page by Robert J. Vanderbei, referenced by URL only.
% The title is kept on one line with no trailing whitespace, and the acronym
% is braced so sentence-casing styles do not lowercase it.
@misc{sail,
    author       = {Vanderbei, Robert J.},
    title        = {Sailing Strategies: An Application Involving Stochastics, Optimization, and Statistics ({SOS})},
    howpublished = {\url{http://www.orfe.princeton.edu/~rvdb/sail/sail.html}},
}

% Gelly and Silver, ICML 2007 (CiteULike export; the citeulike-*, posted-at and
% priority fields are bookkeeping that standard styles ignore).
% {UCT} is braced in the title to survive sentence casing; the abstract uses
% $\lambda$ instead of a raw non-ASCII character so 8-bit BibTeX can process it.
@inproceedings{citeulike:2976742,
    author               = {Gelly, Sylvain and Silver, David},
    title                = {Combining online and offline knowledge in {UCT}},
    booktitle            = {ICML '07: Proceedings of the 24th international conference on Machine learning},
    pages                = {273--280},
    publisher            = {ACM},
    address              = {New York, NY, USA},
    location             = {Corvallis, Oregon},
    year                 = {2007},
    doi                  = {10.1145/1273496.1273531},
    isbn                 = {9781595937933},
    url                  = {http://dx.doi.org/10.1145/1273496.1273531},
    abstract             = {The UCT algorithm learns a value function online using sample-based search. The TD($\lambda$) algorithm can learn a value function offline for the on-policy distribution. We consider three approaches for combining offline and online value functions in the UCT algorithm. First, the offline value function is used as a default policy during Monte-Carlo simulation. Second, the UCT value function is combined with a rapid online estimate of action values. Third, the offline value function is used as prior knowledge in the UCT search tree. We evaluate these algorithms in 9 x 9 Go against GnuGo 3.7.10. The first algorithm performs better than UCT with a random simulation policy, but surprisingly, worse than UCT with a weaker, handcrafted simulation policy. The second algorithm outperforms UCT altogether. The third algorithm outperforms UCT with handcrafted prior knowledge. We combine these algorithms in MoGo, the world's strongest 9 x 9 Go program. Each technique significantly improves MoGo's playing strength.},
    keywords             = {computer-go, game, monte-carlo, tree-search},
    citeulike-article-id = {2976742},
    citeulike-linkout-0  = {http://portal.acm.org/citation.cfm?id=1273496.1273531},
    citeulike-linkout-1  = {http://dx.doi.org/10.1145/1273496.1273531},
    citeulike-linkout-2  = {http://hal.archives-ouvertes.fr/docs/00/16/40/03/PDF/GellySilverICML2007.pdf},
    posted-at            = {2008-07-09 15:15:54},
    priority             = {2},
}

	
% Kocsis and Szepesvari, ECML-06.
% booktitle holds only the venue: styles print their own "In", so none is
% stored here, and the LNCS number lives in series/volume so the style can
% format it. The accented letter is written as a BibTeX special character.
@inproceedings{Kocsis06banditbased,
    author    = {Kocsis, Levente and Szepesv{\'a}ri, Csaba},
    title     = {Bandit based {Monte-Carlo} Planning},
    booktitle = {ECML-06},
    series    = {Lecture Notes in Computer Science},
    volume    = {4212},
    pages     = {282--293},
    publisher = {Springer},
    year      = {2006},
}

% Auer, Cesa-Bianchi and Fischer, Machine Learning 47 (CiteULike export; the
% citeulike-*, posted-at and priority fields are bookkeeping that standard
% styles ignore).
% month uses the predefined macro so each style can abbreviate or localise it;
% the accent is written as a BibTeX special character for correct sorting.
@article{Auer:2002,
    author               = {Auer, Peter and Cesa-Bianchi, Nicol{\`o} and Fischer, Paul},
    title                = {Finite-time Analysis of the Multiarmed Bandit Problem},
    journal              = {Machine Learning},
    volume               = {47},
    number               = {2},
    pages                = {235--256},
    month                = may,
    year                 = {2002},
    doi                  = {10.1023/A:1013689704352},
    url                  = {http://dx.doi.org/10.1023/A:1013689704352},
    abstract             = {Reinforcement learning policies face the exploration versus exploitation dilemma, i.e. the search for a balance between exploring the environment to find profitable actions while taking the empirically best action as often as possible. A popular measure of a policy's success in addressing this dilemma is the regret, that is the loss due to the fact that the globally optimal policy is not followed all the times. One of the simplest examples of the exploration/exploitation dilemma is the multi-armed bandit problem. Lai and Robbins were the first ones to show that the regret for this problem has to grow at least logarithmically in the number of plays. Since then, policies which asymptotically achieve this regret have been devised by Lai and Robbins and many others. In this work we show that the optimal logarithmic regret is also achievable uniformly over time, with simple and efficient policies, and for all reward distributions with bounded support.},
    keywords             = {bandit, monte\_carlo, stochastic\_bandit, ucb, uct},
    citeulike-article-id = {2521953},
    citeulike-linkout-0  = {http://dx.doi.org/10.1023/A:1013689704352},
    posted-at            = {2009-05-19 20:44:24},
    priority             = {2},
}
