/*
 * Nathaniel Lim
 * CS373 - Final Project
 * May 18, 2010
 * Implicit Imitation
 */
package project;

import java.util.ArrayList;
import java.util.Random;

public class Piece {
    protected boolean isObserver;
    protected Piece mentor;
    protected ChessBoard world;
    private Coordinate pos;
    public int[][] tally;
    public int[][][] actionTally;
    private double[][] qMap;
    public double[] vMap;
    private double[] visibleQMap;
    private final double GAMMA = 0.9;
    private final double ALPHA = 0.6;
    private double EPSILON = 0.15;
    private int MENTOR_ACTION_INDEX = -1;
    Random r = new Random();
    private int actionsTaken = 0;
    protected PieceAction[] actions;

    public Coordinate getPosition() {
        return pos;
    }

    public void setPosition(Coordinate p) {
        this.pos = p;
    }

    public void setMentor(Piece m) {
        if (isObserver) {
            this.mentor = m;
        }
    }

    public Piece getMentor() {
        return mentor;
    }

    public Piece(ChessBoard world, boolean isObserver, PieceAction[] actions, Piece mentor) {
        this.mentor = mentor;
        this.world = world;
        this.isObserver = isObserver;
        this.actions = actions;
        this.tally = new int[this.world.numStates()][this.world.numStates()];
        this.actionTally = new int[this.world.numStates()][actions.length][this.world.numStates()];
        this.vMap = new double[this.world.numStates()];
        this.visibleQMap = new double[this.world.numStates()];
        // For observers, the action set is augmented by a_m, the mentor's unknown action.
        if (isObserver) {
            this.qMap = new double[this.world.numStates()][actions.length + 1];
            this.MENTOR_ACTION_INDEX = actions.length;
            // Use the experiences of the mentor to fill in the Q-function for the unknown mentor action.
            for (int s = 0; s < world.numStates(); s++) {
                qMap[s][MENTOR_ACTION_INDEX] = mentor.visibleQMap[s];
            }
        } else {
            this.qMap = new double[this.world.numStates()][actions.length];
        }
    }

    // Gives an observer the mentor's transition probability function:
    // the empirical probability that the mentor moves from state s to state t.
    public double transitionProb(int s, int t) {
        int totalTransitionsFromS = 0;
        for (int i = 0; i < tally[s].length; i++) {
            totalTransitionsFromS += tally[s][i];
        }
        if (totalTransitionsFromS == 0) {
            // No transitions observed from s yet; return 0 rather than dividing 0/0 (NaN).
            return 0.0;
        }
        //System.out.println("" + tally[s][t] + "/" + totalTransitionsFromS);
        return ((double) tally[s][t]) / ((double) totalTransitionsFromS);
    }

    // Gives an agent its own transition probability function:
    // the probability that the agent reaches state t from state s when it chooses action a.
    private double transitionProb(int s, int a, int t) {
        int total_S_A = 0;
        for (int j = 0; j < actionTally[s][a].length; j++) {
            total_S_A += actionTally[s][a][j];
        }
        if (total_S_A == 0) {
            return 0.0;
        }
        //System.out.println("" + actionTally[s][a][t] + "/" + total_S_A);
        return ((double) actionTally[s][a][t]) / ((double) total_S_A);
    }

    public boolean isObserver() {
        return isObserver;
    }

    /*
     * Implements both Q-learning and augmented Bellman backups.
     */
    public int getNextAction() {
        double mentorExpectation = Double.NEGATIVE_INFINITY;
        double ownExpectation = Double.NEGATIVE_INFINITY;
        double bestExpectation;
        double bestActionValue;
        int bestAction;
        int s = world.getStateId(pos.getX(), pos.getY());
        actionsTaken++;
        // Decay EPSILON with every action taken.
        if (EPSILON > 0.0) {
            EPSILON -= 0.001;
        }
        if (world.QLEARNING) {
            // Epsilon-greedy policy for Q-learning.
            if (r.nextDouble() < EPSILON) {
                return r.nextInt(actions.length);
            } else {
                // Take the best action over Q(s, a).
                bestActionValue = Double.NEGATIVE_INFINITY;
                bestAction = 0;
                for (int a = 0; a < actions.length; a++) {
                    if (qMap[s][a] > bestActionValue) {
                        bestActionValue = qMap[s][a];
                        bestAction = a;
                    }
                }
                // If the Piece is an observer, the mentor's unknown action a_m might be best.
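                // The observer cannot execute a_m directly: it only sees the mentor's state
                // transitions, not the action behind them. kldMinimizer(s) therefore picks the
                // observer's own action whose empirical transition distribution out of s is
                // closest (under the cross-entropy measure in klDistance) to the mentor's, and
                // substitutes it whenever the mentor's value estimate looks better.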
                if (isObserver()) {
                    int KLDa = kldMinimizer(s);
                    if (qMap[s][MENTOR_ACTION_INDEX] > qMap[s][bestAction] && KLDa != -1 && actionsTaken > 20) {
                        bestAction = KLDa;
                    }
                }
            }
            return bestAction;
        } else {
            //System.out.println("In State " + getPosition());
            // Augmented Bellman backup (epsilon-greedy policy).
            if (r.nextDouble() < EPSILON) {
                return r.nextInt(actions.length);
            } else {
                // First find the best action to take, given the agent's own model.
                bestAction = 0;
                ownExpectation = Double.NEGATIVE_INFINITY;
                for (int a = 0; a < actions.length; a++) {
                    double sum = 0.0;
                    for (int t = 0; t < world.numStates(); t++) {
                        sum += transitionProb(s, a, t) * vMap[t];
                    }
                    //System.out.println("Action (" + actions[a].dx() + ", " + actions[a].dy() + ") has expectation: " + sum);
                    if (sum > ownExpectation) {
                        ownExpectation = sum;
                        bestAction = a;
                    }
                }
                bestExpectation = ownExpectation;
                boolean mentorTold = false;
                if (isObserver) {
                    // Use the mentor's information.
                    mentorExpectation = 0.0;
                    for (int t = 0; t < world.numStates(); t++) {
                        mentorExpectation += mentor.transitionProb(s, t) * vMap[t];
                    }
                    // Find the agent's own action that minimizes the KL-distance to the mentor's transitions.
                    int kld = kldMinimizer(s);
                    if (mentorExpectation > bestExpectation && kld != -1) {
                        bestExpectation = mentorExpectation;
                        bestAction = kld;
                        mentorTold = true;
                    }
                }
                updateVFunction(s, bestAction, bestExpectation);
                //printStateValues();
                //System.out.println("In State " + getPosition() + " want to take (" + actions[bestAction].dx() + ", " + actions[bestAction].dy() + ")");
                if (mentorTold) {
                    //System.out.println("Because mentor told me");
                }
                return bestAction;
            }
        }
    }

    // Note: despite the name, this computes the cross-entropy -sum(p1 * log(p2)) with p2
    // floored at 1e-5, not the symmetric KL divergence that is commented out below.
    public static double klDistance(double[] vect1, double[] vect2) {
        double sum = 0.0;
        for (int t = 0; t < vect1.length; t++) {
            double p1 = vect1[t];
            double p2 = vect2[t];
            System.out.print("( " + p1 + ", " + p2 + ") ,");
            //sum += (0.5)*(p1* Math.log(p1/p2) + p2* Math.log(p2/p1));
            if (p2 == 0) {
                p2 = 0.00001;
            }
            sum += -p1 * Math.log(p2);
            System.out.println("Sum so far:" + sum);
        }
        return sum;
    }

    public static void main(String[] args) {
        double[] v1 = {0, 0, 0, .3, .7};
        double[] v2 = {0, 0, 0, .3, .7};
        System.out.println("KLDistance: " + klDistance(v1, v2));
        double[] v3 = {.3, .7, 0, 0, 0};
        double[] v4 = {0, 0, 0, .3, .7};
        System.out.println("KLDistance: " + klDistance(v3, v4));
        double[] v5 = {.3, .7, 0, 0, 0};
        double[] v6 = {0, .7, 0, .3, 0};
        System.out.println("KLDistance: " + klDistance(v5, v6));
    }

    // Distance between the agent's own transition distribution under action a and the
    // mentor's transition distribution out of state s (same cross-entropy measure as above).
    private double klDistance(int s, int a) {
        double sum = 0.0;
        for (int t = 0; t < world.numStates(); t++) {
            double p1 = transitionProb(s, a, t);
            double p2 = mentor.transitionProb(s, t);
            //System.out.print("( " + p1 + ", " + p2 + ") ,");
            //sum += (0.5)*(p1* Math.log(p1/p2) + p2* Math.log(p2/p1));
            if (p2 == 0) {
                p2 = 0.00001;
            }
            sum += -p1 * Math.log(p2);
        }
        //System.out.println("Sum:" + sum);
        return sum;
    }

    // Returns the agent's own action whose transition distribution out of s is closest to the mentor's.
    private int kldMinimizer(int s) {
        double smallestKLDistance = Double.POSITIVE_INFINITY;
        int closestAction = 0;
        //ArrayList<Integer> bestBunch = new ArrayList<Integer>();
        for (int a = 0; a < actions.length; a++) {
            double thisDistance = klDistance(s, a);
            if (thisDistance < smallestKLDistance) {
                smallestKLDistance = thisDistance;
                closestAction = a;
            }
        }
        //System.out.println("best: " + smallestKLDistance);
        return closestAction;
    }

    public void updateTally(int s, int a, int t) {
        tally[s][t]++;
        actionTally[s][a][t]++;
    }

    public void updateVFunction(int s, int a, double bestExpectation) {
        double reward = world.getReward(s, a);
        //System.out.println("Updating vmap with bestExp: " + bestExpectation);
        vMap[s] = ALPHA * vMap[s] + (1 - ALPHA) * (reward + GAMMA * bestExpectation);
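        // Blended value backup: V(s) <- ALPHA * V(s) + (1 - ALPHA) * (R(s, a) + GAMMA * E[V(s')]),
        // where the expectation comes from either the agent's own model or the mentor's,
        // whichever promised more in getNextAction().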
    }

    private double perceivedStateValue(int s) {
        double output = 0.0;
        if (world.QLEARNING) {
            for (int a = 0; a < actions.length; a++) {
                output += qMap[s][a];
            }
        } else {
            output = vMap[s];
        }
        return output;
    }

    public void printStateValues() {
        for (int i = 0; i < world.sizeX(); i++) {
            for (int j = 0; j < world.sizeY(); j++) {
                System.out.print(perceivedStateValue(world.getStateId(i, j)) + ", ");
            }
            System.out.println("");
        }
    }

    public void printQMap(int s) {
        System.out.println("Going from: (" + world.getStateCoords(s).getX() + ", " + world.getStateCoords(s).getY() + ")");
        int besta = 0;
        for (int a = 0; a < this.actions.length; a++) {
            if (qMap[s][a] > qMap[s][besta]) {
                besta = a;
            }
        }
        System.out.println("Taking Action: " + "(" + actions[besta].dx() + ", " + actions[besta].dy() + ")" + " has value: " + qMap[s][besta]);
    }

    public void updateQFunction(int s, int a, int t) {
        double reward = world.getReward(s, a);
        if (world.isTerminalState(s)) {
            // End of the episode: no bootstrapped next-state value.
            qMap[s][a] = ALPHA * qMap[s][a] + (1 - ALPHA) * reward;
            visibleQMap[s] = ALPHA * visibleQMap[s] + (1 - ALPHA) * reward;
        } else {
            // Bootstrap from the maximizing next action:
            double bestNextActionValue = Double.NEGATIVE_INFINITY;
            for (int a_Prime = 0; a_Prime < actions.length; a_Prime++) {
                if (qMap[t][a_Prime] > bestNextActionValue) {
                    bestNextActionValue = qMap[t][a_Prime];
                }
            }
            qMap[s][a] = ALPHA * qMap[s][a] + (1 - ALPHA) * (reward + GAMMA * bestNextActionValue);
            visibleQMap[s] = ALPHA * visibleQMap[s] + (1 - ALPHA) * (reward + GAMMA * bestNextActionValue);
        }
    }
}
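/*
 * Hedged usage sketch (not part of the original project code): one plausible way a driver
 * might wire a mentor/observer pair into a training loop. Only the members actually used
 * above (numStates, getStateId, getStateCoords, getReward, isTerminalState, sizeX/sizeY,
 * QLEARNING, dx/dy, getX/getY) are known to exist; the ChessBoard constructor, kingMoves(),
 * applyAction(...) and the Coordinate constructor below are hypothetical placeholders and
 * may not match the real API.
 *
 *     ChessBoard world    = new ChessBoard(8, 8);               // assumed constructor
 *     PieceAction[] moves = world.kingMoves();                  // assumed helper
 *     Piece mentor   = new Piece(world, false, moves, null);
 *     Piece observer = new Piece(world, true,  moves, mentor);
 *     observer.setPosition(new Coordinate(0, 0));               // assumed constructor
 *     for (int step = 0; step < 10000; step++) {
 *         int s = world.getStateId(observer.getPosition().getX(),
 *                                  observer.getPosition().getY());
 *         int a = observer.getNextAction();
 *         Coordinate next = world.applyAction(observer.getPosition(),
 *                                             observer.actions[a]);  // assumed helper
 *         int t = world.getStateId(next.getX(), next.getY());
 *         observer.updateTally(s, a, t);
 *         if (world.QLEARNING) {
 *             observer.updateQFunction(s, a, t);
 *         }
 *         observer.setPosition(next);
 *     }
 *
 * A full driver would step the mentor the same way so that mentor.tally and
 * mentor.visibleQMap are populated before the observer leans on them.
 */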