package aima.core.learning.reinforcement.agent;

import java.util.HashMap;
import java.util.Map;

import aima.core.agent.Action;
import aima.core.learning.reinforcement.PerceptStateReward;
import aima.core.util.FrequencyCounter;

/**
 * Artificial Intelligence A Modern Approach (3rd Edition): page 837.<br>
 * <br>
 * 
 * <pre>
 * function PASSIVE-TD-AGENT(percept) returns an action
 *   inputs: percept, a percept indicating the current state s' and reward signal r'
 *   persistent: π, a fixed policy
 *               U, a table of utilities, initially empty
 *               N<sub>s</sub>, a table of frequencies for states, initially zero
 *               s,a,r, the previous state, action, and reward, initially null
 * 
 *   if s' is new then U[s'] <- r'
 *   if s is not null then
 *      increment N<sub>s</sub>[s]
 *      U[s] <- U[s] + α(N<sub>s</sub>[s])(r + γU[s'] - U[s])
 *   if s'.TERMINAL? then s,a,r <- null else s,a,r <- s',π[s'],r'
 *   return a
 * </pre>
 * 
 * Figure 21.4 A passive reinforcement learning agent that learns utility
 * estimates using temporal differences. The step-size function α(n) is
 * chosen to ensure convergence, as described in the text.
 * 
 * @param <S>
 *            the state type.
 * @param <A>
 *            the action type.
 * 
 * @author Ciaran O'Reilly
 * @author Ravi Mohan
 */
public class PassiveTDAgent<S, A extends Action> extends
		ReinforcementAgent<S, A> {
	// persistent: π, a fixed policy
	private Map<S, A> pi = new HashMap<S, A>();
	// U, a table of utilities, initially empty
	private Map<S, Double> U = new HashMap<S, Double>();
	// N<sub>s</sub>, a table of frequencies for states, initially zero
	private FrequencyCounter<S> Ns = new FrequencyCounter<S>();
	// s,a,r, the previous state, action, and reward, initially null
	private S s = null;
	private A a = null;
	private Double r = null;
	//
	private double alpha = 0.0;
	private double gamma = 0.0;

	/**
	 * Constructor.
	 * 
	 * @param fixedPolicy
	 *            π a fixed policy.
	 * @param alpha
	 *            a fixed learning rate.
	 * @param gamma
	 *            discount to be used.
	 */
	public PassiveTDAgent(Map<S, A> fixedPolicy, double alpha, double gamma) {
		this.pi.putAll(fixedPolicy);
		this.alpha = alpha;
		this.gamma = gamma;
	}

	/**
	 * Passive reinforcement learning that learns utility estimates using
	 * temporal differences.
	 * 
	 * @param percept
	 *            a percept indicating the current state s' and reward signal
	 *            r'.
	 * @return an action
	 */
	@Override
	public A execute(PerceptStateReward<S> percept) {
		// if s' is new then U[s'] <- r'
		S sDelta = percept.state();
		double rDelta = percept.reward();
		if (!U.containsKey(sDelta)) {
			U.put(sDelta, rDelta);
		}
		// if s is not null then
		if (null != s) {
			// increment N<sub>s</sub>[s]
			Ns.incrementFor(s);
			// U[s] <- U[s] + α(N<sub>s</sub>[s])(r + γU[s'] - U[s])
			double U_s = U.get(s);
			U.put(s, U_s + alpha(Ns, s) * (r + gamma * U.get(sDelta) - U_s));
		}
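		// Illustrative arithmetic for the update above (hypothetical numbers,
		// not taken from the text): with γ = 1.0, α(N<sub>s</sub>[s]) = 0.1,
		// U[s] = 0.5, U[s'] = 0.7 and r = -0.04, the new estimate is
		// U[s] <- 0.5 + 0.1 * (-0.04 + 1.0 * 0.7 - 0.5) = 0.516.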
		// if s'.TERMINAL? then s,a,r <- null else s,a,r <- s',π[s'],r'
		if (isTerminal(sDelta)) {
			s = null;
			a = null;
			r = null;
		} else {
			s = sDelta;
			a = pi.get(sDelta);
			r = rDelta;
		}
		// return a
		return a;
	}

	@Override
	public Map<S, Double> getUtility() {
		return new HashMap<S, Double>(U);
	}

	@Override
	public void reset() {
		U = new HashMap<S, Double>();
		Ns.clear();
		s = null;
		a = null;
		r = null;
	}

	//
	// PROTECTED METHODS
	//

	/**
	 * AIMA3e pg. 836 'if we change α from a fixed parameter to a function that
	 * decreases as the number of times a state has been visited increases,
	 * then U<sup>π</sup>(s) itself will converge to the correct value.'<br>
	 * <br>
	 * <b>Note:</b> override this method to obtain the desired behavior.
	 * 
	 * @param Ns
	 *            a frequency counter of observed states.
	 * @param s
	 *            the current state.
	 * @return the learning rate to use based on the frequency of the state
	 *         passed in.
	 */
	protected double alpha(FrequencyCounter<S> Ns, S s) {
		// The default implementation simply returns the fixed learning rate,
		// irrespective of the number of times the state has been encountered.
		return alpha;
	}

	//
	// PRIVATE METHODS
	//
	private boolean isTerminal(S s) {
		boolean terminal = false;
		Action a = pi.get(s);
		if (null == a || a.isNoOp()) {
			// A state in which no action is possible is considered terminal.
			terminal = true;
		}
		return terminal;
	}
}
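
/**
 * A minimal sketch (not part of the AIMA code base) of the kind of override
 * suggested in the Javadoc of PassiveTDAgent.alpha(...): the step size decays
 * with the number of visits to a state, as required for the convergence
 * result quoted above. The class name and the decay schedule
 * α(n) = 60/(59 + n) are illustrative choices, and it is assumed that
 * FrequencyCounter exposes getCount(key) returning the number of observations
 * of a state. Construction mirrors PassiveTDAgent, minus the now-unused
 * fixed learning rate.
 */
class DecayingAlphaPassiveTDAgent<S, A extends Action> extends
		PassiveTDAgent<S, A> {

	public DecayingAlphaPassiveTDAgent(Map<S, A> fixedPolicy, double gamma) {
		// The fixed learning rate passed to the superclass is irrelevant once
		// alpha() is overridden below.
		super(fixedPolicy, 1.0, gamma);
	}

	@Override
	protected double alpha(FrequencyCounter<S> Ns, S s) {
		// alpha(Ns, s) is only called after Ns has been incremented for s, so
		// the count is at least 1 and the returned step size decreases towards
		// zero as s is visited more often.
		return 60.0 / (59.0 + Ns.getCount(s));
	}
}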