package aima.core.probability.mdp.impl;
import java.util.HashMap;
import java.util.Map;
import aima.core.agent.Action;
import aima.core.probability.mdp.MarkovDecisionProcess;
import aima.core.probability.mdp.PolicyEvaluation;
/**
* Artificial Intelligence A Modern Approach (3rd Edition): page 657.<br>
* <br>
* For small state spaces, policy evaluation using exact solution methods is
* often the most efficient approach. For large state spaces, O(n<sup>3</sup>)
* time might be prohibitive. Fortunately, it is not necessary to do exact
* policy evaluation. Instead, we can perform some number of simplified value
* iteration steps (simplified because the policy is fixed) to give a reasonably
* good approximation of utilities. The simplified Bellman update for this
* process is:<br>
* <br>
*
* <pre>
* U<sub>i+1</sub>(s) &lt;- R(s) + γΣ<sub>s'</sub>P(s'|s,π<sub>i</sub>(s))U<sub>i</sub>(s')
* </pre>
*
* and this is repeated k times to produce the next utility estimate. The
* resulting algorithm is called <b>modified policy iteration</b>. It is often
* much more efficient than standard policy iteration or value iteration.
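* <br>
* <br>
* For instance, a single update with γ = 0.9, R(s) = -0.04, and two
* successor states reached with probabilities 0.8 and 0.2 whose current
* utilities are 1.0 and 0.5 (all values purely illustrative) gives:
*
* <pre>
* U<sub>i+1</sub>(s) = -0.04 + 0.9 * (0.8 * 1.0 + 0.2 * 0.5) = 0.77
* </pre>
*
* A sketch of typical usage, where MyState and the mdp, pi, and U bindings
* are illustrative placeholders rather than part of this framework:
*
* <pre>
* MarkovDecisionProcess&lt;MyState, Action&gt; mdp = ...; // the MDP to evaluate in
* Map&lt;MyState, Action&gt; pi = ...; // the fixed policy π to be evaluated
* Map&lt;MyState, Double&gt; U = ...; // initial utility estimates
* PolicyEvaluation&lt;MyState, Action&gt; pe = new ModifiedPolicyEvaluation&lt;MyState, Action&gt;(50, 1.0);
* Map&lt;MyState, Double&gt; U_pi = pe.evaluate(pi, U, mdp);
* </pre>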
*
* @param <S>
* the state type.
* @param <A>
* the action type.
*
* @author Ciaran O'Reilly
* @author Ravi Mohan
*
*/
public class ModifiedPolicyEvaluation<S, A extends Action> implements PolicyEvaluation<S, A> {
// # iterations to use to produce the next utility estimate
private final int k;
// discount factor γ to be used.
private final double gamma;
/**
* Constructor.
*
* @param k
* the number of iterations to use to produce the next utility
* estimate; must be >= 1
* @param gamma
* the discount factor γ to be used; must be greater than 0 and
* at most 1
*/
public ModifiedPolicyEvaluation(int k, double gamma) {
if (k < 1) {
throw new IllegalArgumentException("k must be >= 1");
}
if (gamma > 1.0 || gamma <= 0.0) {
throw new IllegalArgumentException("Gamma must be > 0 and <= 1.0");
}
this.k = k;
this.gamma = gamma;
}
//
// START-PolicyEvaluation
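/**
* Perform k simplified Bellman updates, keeping the policy fixed, to
* produce the next approximation of the utilities of the MDP's states.
*
* @param pi_i
*            the current policy π<sub>i</sub>, mapping each non-terminal
*            state to an action.
* @param U
*            the current utility estimates, U<sub>i</sub>.
* @param mdp
*            the MDP whose state utilities are to be estimated.
* @return the updated utility estimates after k simplified Bellman
*         updates.
*/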
@Override
public Map<S, Double> evaluate(Map<S, A> pi_i, Map<S, Double> U,
MarkovDecisionProcess<S, A> mdp) {
Map<S, Double> U_i = new HashMap<S, Double>(U);
Map<S, Double> U_ip1 = new HashMap<S, Double>(U);
// repeat k times to produce the next utility estimate
for (int i = 0; i < k; i++) {
// U<sub>i+1</sub>(s) <- R(s) +
// γΣ<sub>s'</sub>P(s'|s,π<sub>i</sub>(s))U<sub>i</sub>(s')
for (S s : U.keySet()) {
A ap_i = pi_i.get(s);
double aSum = 0;
// Handle terminal states (i.e. no actions)
if (null != ap_i) {
for (S sDelta : U.keySet()) {
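// accumulate P(s' | s, π_i(s)) * U_i(s') for each successor s'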
aSum += mdp.transitionProbability(sDelta, s, ap_i)
* U_i.get(sDelta);
}
}
U_ip1.put(s, mdp.reward(s) + gamma * aSum);
}
U_i.putAll(U_ip1);
}
return U_ip1;
}
// END-PolicyEvaluation
//
}