package aima.core.learning.reinforcement.agent;

import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import aima.core.agent.Action;
import aima.core.learning.reinforcement.PerceptStateReward;
import aima.core.probability.mdp.ActionsFunction;
import aima.core.probability.mdp.PolicyEvaluation;
import aima.core.probability.mdp.RewardFunction;
import aima.core.probability.mdp.TransitionProbabilityFunction;
import aima.core.probability.mdp.impl.MDP;
import aima.core.util.FrequencyCounter;
import aima.core.util.datastructure.Pair;

/**
 * Artificial Intelligence A Modern Approach (3rd Edition): page 834.<br>
 * <br>
 * 
 * <pre>
 * function PASSIVE-ADP-AGENT(percept) returns an action
 *   inputs: percept, a percept indicating the current state s' and reward signal r'
 *   persistent: π, a fixed policy
 *               mdp, an MDP with model P, rewards R, discount γ
 *               U, a table of utilities, initially empty
 *               N<sub>sa</sub>, a table of frequencies for state-action pairs, initially zero
 *               N<sub>s'|sa</sub>, a table of outcome frequencies given state-action pairs, initially zero
 *               s, a, the previous state and action, initially null
 * 
 *   if s' is new then U[s'] <- r'; R[s'] <- r'
 *   if s is not null then
 *       increment N<sub>sa</sub>[s,a] and N<sub>s'|sa</sub>[s',s,a]
 *       for each t such that N<sub>s'|sa</sub>[t,s,a] is nonzero do
 *           P(t|s,a) <- N<sub>s'|sa</sub>[t,s,a] / N<sub>sa</sub>[s,a]
 *   U <- POLICY-EVALUATION(π, U, mdp)
 *   if s'.TERMINAL? then s,a <- null else s,a <- s',π[s']
 *   return a
 * </pre>
 * 
 * Figure 21.2 A passive reinforcement learning agent based on adaptive dynamic
 * programming. The POLICY-EVALUATION function solves the fixed-policy Bellman
 * equations, as described on page 657.
 * 
 * @param <S>
 *            the state type.
 * @param <A>
 *            the action type.
 * 
 * @author Ciaran O'Reilly
 * @author Ravi Mohan
 * 
 */
public class PassiveADPAgent<S, A extends Action> extends ReinforcementAgent<S, A> {
    // persistent: π, a fixed policy
    private Map<S, A> pi = new HashMap<S, A>();
    // mdp, an MDP with model P, rewards R, discount γ
    private MDP<S, A> mdp = null;
    private Map<Pair<S, Pair<S, A>>, Double> P = new HashMap<Pair<S, Pair<S, A>>, Double>();
    private Map<S, Double> R = new HashMap<S, Double>();
    private PolicyEvaluation<S, A> policyEvaluation = null;
    // U, a table of utilities, initially empty
    private Map<S, Double> U = new HashMap<S, Double>();
    // N<sub>sa</sub>, a table of frequencies for state-action pairs,
    // initially zero
    private FrequencyCounter<Pair<S, A>> Nsa = new FrequencyCounter<Pair<S, A>>();
    // N<sub>s'|sa</sub>, a table of outcome frequencies given state-action
    // pairs, initially zero
    private FrequencyCounter<Pair<S, Pair<S, A>>> NsDelta_sa = new FrequencyCounter<Pair<S, Pair<S, A>>>();
    // s, a, the previous state and action, initially null
    private S s = null;
    private A a = null;

    /**
     * Constructor.
     * 
     * @param fixedPolicy
     *            π a fixed policy.
     * @param states
     *            the possible states in the world (i.e. fully observable).
     * @param initialState
     *            the initial state for the agent.
     * @param actionsFunction
     *            a function that lists the legal actions from a state.
     * @param policyEvaluation
     *            a function for evaluating a policy.
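     * 
     *            A minimal construction sketch is shown below. It is
     *            hypothetical: MyState, MyAction (an Action implementation)
     *            and the create...() helpers are application-supplied
     *            placeholders, and ModifiedPolicyEvaluation is assumed to be
     *            an available PolicyEvaluation implementation taking an
     *            iteration count and a discount factor.
     * 
     * <pre>
     * Map<MyState, MyAction> fixedPolicy = createFixedPolicy();
     * Set<MyState> states = createAllStates();
     * MyState initialState = createInitialState();
     * ActionsFunction<MyState, MyAction> actionsFunction = createActionsFunction();
     * // Assumed: k iterations per evaluation step and a discount of 1.0.
     * PolicyEvaluation<MyState, MyAction> pe = new ModifiedPolicyEvaluation<MyState, MyAction>(
     *         50, 1.0);
     * 
     * PassiveADPAgent<MyState, MyAction> agent = new PassiveADPAgent<MyState, MyAction>(
     *         fixedPolicy, states, initialState, actionsFunction, pe);
     * // During each trial, pass the latest percept (s', r') to
     * // agent.execute(percept) and apply the action it returns.
     * </pre>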
     */
    public PassiveADPAgent(Map<S, A> fixedPolicy, Set<S> states,
            S initialState, ActionsFunction<S, A> actionsFunction,
            PolicyEvaluation<S, A> policyEvaluation) {
        this.pi.putAll(fixedPolicy);
        this.mdp = new MDP<S, A>(states, initialState, actionsFunction,
                new TransitionProbabilityFunction<S, A>() {
                    public double probability(S sDelta, S s, A a) {
                        Double p = P.get(new Pair<S, Pair<S, A>>(sDelta,
                                new Pair<S, A>(s, a)));
                        return null == p ? 0.0 : p.doubleValue();
                    }
                }, new RewardFunction<S>() {
                    public double reward(S s) {
                        return R.get(s);
                    }
                });
        this.policyEvaluation = policyEvaluation;
    }

    /**
     * Passive reinforcement learning based on adaptive dynamic programming.
     * 
     * @param percept
     *            a percept indicating the current state s' and reward signal
     *            r'.
     * @return an action
     */
    @Override
    public A execute(PerceptStateReward<S> percept) {
        // if s' is new then U[s'] <- r'; R[s'] <- r'
        S sDelta = percept.state();
        double rDelta = percept.reward();
        if (!U.containsKey(sDelta)) {
            U.put(sDelta, rDelta);
            R.put(sDelta, rDelta);
        }
        // if s is not null then
        if (null != s) {
            // increment N<sub>sa</sub>[s,a] and N<sub>s'|sa</sub>[s',s,a]
            Pair<S, A> sa = new Pair<S, A>(s, a);
            Nsa.incrementFor(sa);
            NsDelta_sa.incrementFor(new Pair<S, Pair<S, A>>(sDelta, sa));
            // for each t such that N<sub>s'|sa</sub>[t,s,a] is nonzero do
            for (S t : mdp.states()) {
                Pair<S, Pair<S, A>> t_sa = new Pair<S, Pair<S, A>>(t, sa);
                if (0 != NsDelta_sa.getCount(t_sa)) {
                    // P(t|s,a) <- N<sub>s'|sa</sub>[t,s,a] / N<sub>sa</sub>[s,a]
                    P.put(t_sa, NsDelta_sa.getCount(t_sa).doubleValue()
                            / Nsa.getCount(sa).doubleValue());
                }
            }
        }
        // U <- POLICY-EVALUATION(π, U, mdp)
        U = policyEvaluation.evaluate(pi, U, mdp);
        // if s'.TERMINAL? then s,a <- null else s,a <- s',π[s']
        if (isTerminal(sDelta)) {
            s = null;
            a = null;
        } else {
            s = sDelta;
            a = pi.get(sDelta);
        }
        // return a
        return a;
    }

    @Override
    public Map<S, Double> getUtility() {
        return Collections.unmodifiableMap(U);
    }

    @Override
    public void reset() {
        P.clear();
        R.clear();
        U = new HashMap<S, Double>();
        Nsa.clear();
        NsDelta_sa.clear();
        s = null;
        a = null;
    }

    //
    // PRIVATE METHODS
    //
    private boolean isTerminal(S s) {
        boolean terminal = false;
        if (0 == mdp.actions(s).size()) {
            // A state with no applicable actions is considered terminal.
            terminal = true;
        }
        return terminal;
    }
}