package aima.core.probability.mdp.search;

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import aima.core.agent.Action;
import aima.core.probability.mdp.MarkovDecisionProcess;
import aima.core.probability.mdp.Policy;
import aima.core.probability.mdp.PolicyEvaluation;
import aima.core.probability.mdp.impl.LookupPolicy;
import aima.core.util.Util;

/**
 * Artificial Intelligence A Modern Approach (3rd Edition): page 657.<br>
 * <br>
 * 
 * <pre>
 * function POLICY-ITERATION(mdp) returns a policy
 *   inputs: mdp, an MDP with states S, actions A(s), transition model P(s' | s, a)
 *   local variables: U, a vector of utilities for states in S, initially zero
 *                    π, a policy vector indexed by state, initially random
 * 
 *   repeat
 *       U <- POLICY-EVALUATION(π, U, mdp)
 *       unchanged? <- true
 *       for each state s in S do
 *           if max<sub>a ∈ A(s)</sub> Σ<sub>s'</sub>P(s'|s,a)U[s'] > Σ<sub>s'</sub>P(s'|s,π[s])U[s'] then do
 *               π[s] <- argmax<sub>a ∈ A(s)</sub> Σ<sub>s'</sub>P(s'|s,a)U[s']
 *               unchanged? <- false
 *   until unchanged?
 *   return π
 * </pre>
 * 
 * Figure 17.7 The policy iteration algorithm for calculating an optimal
 * policy.
 * 
 * @param <S>
 *            the state type.
 * @param <A>
 *            the action type.
 * 
 * @author Ciaran O'Reilly
 * @author Ravi Mohan
 */
public class PolicyIteration<S, A extends Action> {
    private PolicyEvaluation<S, A> policyEvaluation = null;

    /**
     * Constructor.
     * 
     * @param policyEvaluation
     *            the policy evaluation function to use.
     */
    public PolicyIteration(PolicyEvaluation<S, A> policyEvaluation) {
        this.policyEvaluation = policyEvaluation;
    }

    // function POLICY-ITERATION(mdp) returns a policy
    /**
     * The policy iteration algorithm for calculating an optimal policy.
     * 
     * @param mdp
     *            an MDP with states S, actions A(s), transition model
     *            P(s'|s,a)
     * @return an optimal policy
     */
    public Policy<S, A> policyIteration(MarkovDecisionProcess<S, A> mdp) {
        // local variables: U, a vector of utilities for states in S,
        // initially zero
        Map<S, Double> U = Util.create(mdp.states(), 0.0);
        // π, a policy vector indexed by state, initially random
        Map<S, A> pi = initialPolicyVector(mdp);
        boolean unchanged;

        // repeat
        do {
            // U <- POLICY-EVALUATION(π, U, mdp)
            U = policyEvaluation.evaluate(pi, U, mdp);
            // unchanged? <- true
            unchanged = true;
            // for each state s in S do
            for (S s : mdp.states()) {
                // calculate:
                // max<sub>a ∈ A(s)</sub> Σ<sub>s'</sub>P(s'|s,a)U[s']
                double aMax = Double.NEGATIVE_INFINITY, piVal = 0;
                A aArgmax = pi.get(s);
                for (A a : mdp.actions(s)) {
                    double aSum = 0;
                    for (S sDelta : mdp.states()) {
                        aSum += mdp.transitionProbability(sDelta, s, a)
                                * U.get(sDelta);
                    }
                    if (aSum > aMax) {
                        aMax = aSum;
                        aArgmax = a;
                    }
                    // track:
                    // Σ<sub>s'</sub>P(s'|s,π[s])U[s']
                    if (a.equals(pi.get(s))) {
                        piVal = aSum;
                    }
                }
                // if max<sub>a ∈ A(s)</sub> Σ<sub>s'</sub>P(s'|s,a)U[s']
                //    > Σ<sub>s'</sub>P(s'|s,π[s])U[s'] then do
                if (aMax > piVal) {
                    // π[s] <- argmax<sub>a ∈ A(s)</sub>
                    // Σ<sub>s'</sub>P(s'|s,a)U[s']
                    pi.put(s, aArgmax);
                    // unchanged? <- false
                    unchanged = false;
                }
            }
            // until unchanged?
        } while (!unchanged);

        // return π
        return new LookupPolicy<S, A>(pi);
    }
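
    // Note: the innermost loop above computes the one-step expected utility
    // Σ<sub>s'</sub>P(s'|s,a)U[s'] (a Q-value under the current utilities U).
    // A minimal sketch of how that calculation could be factored into a
    // helper, with identical behavior (hypothetical; not part of the
    // original class):
    //
    // private double expectedUtility(S s, A a, Map<S, Double> U,
    //         MarkovDecisionProcess<S, A> mdp) {
    //     double sum = 0;
    //     for (S sDelta : mdp.states()) {
    //         // weight each successor's utility by its transition probability
    //         sum += mdp.transitionProbability(sDelta, s, a) * U.get(sDelta);
    //     }
    //     return sum;
    // }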

    /**
     * Create a policy vector indexed by state, initially random.
     * 
     * @param mdp
     *            an MDP with states S, actions A(s), transition model
     *            P(s'|s,a)
     * @return a policy vector indexed by state, initially random.
     */
    public static <S, A extends Action> Map<S, A> initialPolicyVector(
            MarkovDecisionProcess<S, A> mdp) {
        Map<S, A> pi = new LinkedHashMap<S, A>();
        List<A> actions = new ArrayList<A>();
        for (S s : mdp.states()) {
            actions.clear();
            actions.addAll(mdp.actions(s));
            // Handle terminal states (i.e. no actions).
            if (actions.size() > 0) {
                pi.put(s, Util.selectRandomlyFromList(actions));
            }
        }
        return pi;
    }
}
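
/*
 * Usage sketch (assumptions not confirmed by this file: `mdp` is a concrete
 * MarkovDecisionProcess<S, A>, ModifiedPolicyEvaluation is an available
 * PolicyEvaluation implementation whose constructor takes an iteration bound
 * k and a discount factor gamma, and Policy exposes action(S)):
 *
 * PolicyEvaluation<S, A> pe = new ModifiedPolicyEvaluation<S, A>(50, 1.0);
 * PolicyIteration<S, A> pit = new PolicyIteration<S, A>(pe);
 * Policy<S, A> policy = pit.policyIteration(mdp);
 * A toDo = policy.action(currentState); // consult the computed policy
 */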