/* * JABM - Java Agent-Based Modeling Toolkit * Copyright (C) 2013 Steve Phelps * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 3 of * the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for more details. */ package net.sourceforge.jabm.learning; import net.sourceforge.jabm.prng.DiscreteProbabilityDistribution; import cern.jet.random.engine.RandomEngine; /** * <p> * An implementation of the softmax action selection policy. * </p> * * <p> * See:<br> * Sutton, R. S., Barto, A. G., 1998. Reinforcement Learning: An Introduction. * MIT Press.<br> * </p> * * @author Steve Phelps * */ public class SoftMaxActionSelector implements ActionSelector { protected RandomEngine prng; /** * The "temperature" used to modulate the propensity distribution. */ protected double tau; public SoftMaxActionSelector() { super(); } public SoftMaxActionSelector(RandomEngine prng, double tau) { super(); this.prng = prng; this.tau = tau; } @Override public int act(int state, MDPLearner learner) { double q[] = learner.getValueEstimates(state); double p[] = new double[q.length]; double total = 0; double totalQ = 0; for(int i=0; i<p.length; i++) { double propensity = Math.exp(q[i]/tau); p[i] = propensity; total += propensity; totalQ += q[i]; } if (Math.abs(totalQ) > 10e-6) { for(int i=0; i<q.length; i++) { p[i] = p[i] / total; } } else { for(int i=0; i<q.length; i++) { p[i] = 1.0/(double) q.length; } } DiscreteProbabilityDistribution dist = new DiscreteProbabilityDistribution(prng, p); return dist.generateRandomEvent(); } public RandomEngine getPrng() { return prng; } public void setPrng(RandomEngine prng) { this.prng = prng; } public double getTau() { return tau; } /** * @param tau The "temperature" used to modulate the propensities. */ public void setTau(double tau) { this.tau = tau; } }