package aima.core.probability.mdp.impl;

import java.util.HashMap;
import java.util.Map;

import aima.core.agent.Action;
import aima.core.probability.mdp.MarkovDecisionProcess;
import aima.core.probability.mdp.PolicyEvaluation;

/**
 * Artificial Intelligence A Modern Approach (3rd Edition): page 657.<br>
 * <br>
 * For small state spaces, policy evaluation using exact solution methods is
 * often the most efficient approach. For large state spaces, O(n<sup>3</sup>)
 * time might be prohibitive. Fortunately, it is not necessary to do exact
 * policy evaluation. Instead, we can perform some number of simplified value
 * iteration steps (simplified because the policy is fixed) to give a
 * reasonably good approximation of the utilities. The simplified Bellman
 * update for this process is:<br>
 * <br>
 * 
 * <pre>
 * U<sub>i+1</sub>(s) <- R(s) + γΣ<sub>s'</sub>P(s'|s,π<sub>i</sub>(s))U<sub>i</sub>(s')
 * </pre>
 * 
 * and this is repeated k times to produce the next utility estimate. The
 * resulting algorithm is called <b>modified policy iteration</b>. It is often
 * much more efficient than standard policy iteration or value iteration.
 * 
 * @param <S>
 *            the state type.
 * @param <A>
 *            the action type.
 * 
 * @author Ciaran O'Reilly
 * @author Ravi Mohan
 */
public class ModifiedPolicyEvaluation<S, A extends Action> implements
		PolicyEvaluation<S, A> {
	// # iterations to use to produce the next utility estimate
	private int k;
	// discount γ to be used.
	private double gamma;

	/**
	 * Constructor.
	 * 
	 * @param k
	 *            number of iterations to use to produce the next utility
	 *            estimate.
	 * @param gamma
	 *            discount γ to be used.
	 */
	public ModifiedPolicyEvaluation(int k, double gamma) {
		if (gamma > 1.0 || gamma <= 0.0) {
			throw new IllegalArgumentException("Gamma must be > 0 and <= 1.0");
		}
		this.k = k;
		this.gamma = gamma;
	}

	//
	// START-PolicyEvaluation
	@Override
	public Map<S, Double> evaluate(Map<S, A> pi_i, Map<S, Double> U,
			MarkovDecisionProcess<S, A> mdp) {
		Map<S, Double> U_i = new HashMap<S, Double>(U);
		Map<S, Double> U_ip1 = new HashMap<S, Double>(U);
		// repeat k times to produce the next utility estimate
		for (int i = 0; i < k; i++) {
			// U<sub>i+1</sub>(s) <- R(s) +
			// γΣ<sub>s'</sub>P(s'|s,π<sub>i</sub>(s))U<sub>i</sub>(s')
			for (S s : U.keySet()) {
				A ap_i = pi_i.get(s);
				double aSum = 0;
				// Handle terminal states (i.e. no actions)
				if (null != ap_i) {
					for (S sDelta : U.keySet()) {
						aSum += mdp.transitionProbability(sDelta, s, ap_i)
								* U_i.get(sDelta);
					}
				}
				U_ip1.put(s, mdp.reward(s) + gamma * aSum);
			}
			// the new estimates become the basis for the next iteration
			U_i.putAll(U_ip1);
		}
		return U_ip1;
	}

	// END-PolicyEvaluation
	//
}
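
/*
 * Usage sketch (illustrative only, not part of the original class). It relies
 * solely on the constructor and evaluate(...) signature defined above;
 * MyState and MyAction are hypothetical types, and `pi` (a fixed policy),
 * `U0` (initial utilities) and `mdp` (a MarkovDecisionProcess implementation)
 * are assumed to exist elsewhere:
 * 
 *   // k = 50 simplified Bellman updates per call, discount gamma = 1.0
 *   PolicyEvaluation<MyState, MyAction> pe =
 *           new ModifiedPolicyEvaluation<MyState, MyAction>(50, 1.0);
 *   // produce the next utility estimate U1 for the fixed policy pi
 *   Map<MyState, Double> U1 = pe.evaluate(pi, U0, mdp);
 */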