/* Nathaniel Lim
* CS373 - Final Project
* May 18, 2010
* Implicit Imitation
*/
package project;
import java.util.ArrayList;
import java.util.Random;
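/**
 * A learning agent on a ChessBoard grid world. A Piece learns either by Q-learning
 * or by augmented Bellman backups over an estimated transition model; an observer
 * additionally watches a mentor Piece and folds the mentor's observed transitions
 * into its own choices (implicit imitation).
 *
 * Rough usage sketch (the driving loop lives outside this class; "board", "actions",
 * "s", and "t" below are placeholders, not names defined in this file):
 *
 *   Piece mentor   = new Piece(board, false, actions, null);
 *   Piece observer = new Piece(board, true,  actions, mentor);
 *   int a = observer.getNextAction();
 *   // ...the world applies the action, moving the piece from state s to state t...
 *   observer.updateTally(s, a, t);
 *   observer.updateQFunction(s, a, t); // in Q-learning mode
 */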
public class Piece {
protected boolean isObserver;
protected Piece mentor;
protected ChessBoard world;
private Coordinate pos;
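//tally[s][t]: count of this piece's observed transitions s -> t (observers query a mentor's tally via transitionProb(s, t));
//actionTally[s][a][t]: count of this piece's own transitions s -> t under action a.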
public int [][] tally;
public int [][][] actionTally;
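//qMap[s][a]: Q-value estimates (observers get one extra column for the mentor's action a_m);
//vMap[s]: state-value estimates for the Bellman-backup mode;
//visibleQMap[s]: the value estimate a mentor exposes to its observers.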
private double [][] qMap;
public double [] vMap;
private double [] visibleQMap;
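//Discount factor, weight kept on the old estimate in each update, and exploration rate.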
private final double GAMMA = 0.9;
private final double ALPHA = 0.6;
private double EPSILON = 0.15;
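//Index of the mentor's implicit action a_m in qMap (observers only; -1 otherwise).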
private int MENTOR_ACTION_INDEX = -1;
Random r = new Random();
private int actionsTaken = 0;
protected PieceAction[] actions;
public Coordinate getPosition(){
return pos;
}
public void setPosition(Coordinate p){
this.pos = p;
}
public void setMentor(Piece m){
if (isObserver){
this.mentor = m;
}
}
public Piece getMentor(){
return mentor;
}
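/*
 * Builds a Piece. Observers get an extra qMap column for the mentor's action a_m,
 * initialized from the mentor's visible value estimates; non-observers get a qMap
 * over their own actions only.
 */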
public Piece (ChessBoard world, boolean isObserver, PieceAction[] actions, Piece mentor){
this.mentor = mentor;
this.world = world;
this.isObserver = isObserver;
this.actions = actions;
this.tally = new int[this.world.numStates()][this.world.numStates()];
this.actionTally = new int[this.world.numStates()][actions.length][this.world.numStates()];
this.vMap = new double [this.world.numStates()];
this.visibleQMap = new double[this.world.numStates()];
//For observers, the action set is augmented with a_m, the mentor's (unobserved) action.
if (isObserver){
this.qMap = new double[this.world.numStates()][actions.length+1];
this.MENTOR_ACTION_INDEX = actions.length;
//Seed the Q-values of the mentor action a_m from the mentor's visible value estimates.
for (int s = 0; s < world.numStates(); s++){
qMap[s][MENTOR_ACTION_INDEX] = mentor.visibleQMap[s];
}
} else {
this.qMap = new double[this.world.numStates()][actions.length];
}
}
//Gives an observer the mentor's empirical transition probabilities:
//the fraction of observed transitions out of state s that landed in state t.
public double transitionProb(int s, int t){
int totalTransitionsFromS = 0;
for (int i = 0; i < tally[s].length; i++){
totalTransitionsFromS += tally[s][i];
}
//Guard against division by zero when no transitions from s have been observed yet.
if (totalTransitionsFromS == 0){
return 0.0;
}
return ((double)tally[s][t])/((double)totalTransitionsFromS);
}
//Gives an agent its own empirical transition model:
//the probability of reaching state t when choosing action a in state s.
private double transitionProb(int s, int a, int t){
int total_S_A = 0;
for (int j = 0; j < actionTally[s][a].length; j++){
total_S_A += actionTally[s][a][j];
}
if(total_S_A == 0){
return 0.0;
}
//System.out.println("" + actionTally[s][a][t] + "/" + total_S_A);
return ((double)actionTally[s][a][t])/((double)total_S_A);
}
public boolean isObserver() {
return isObserver;
}
/*
 * Chooses the next action index with an epsilon-greedy policy.
 * In Q-learning mode the greedy choice maximizes Q(s, a); otherwise an augmented
 * Bellman backup compares the expected value of each own action against the
 * expectation under the mentor's observed transitions, substituting the KL-closest
 * own action when the mentor's behavior looks better.
 */
public int getNextAction() {
double mentorExpectation = Double.NEGATIVE_INFINITY;
double ownExpectation = Double.NEGATIVE_INFINITY;
double bestExpectation;
double bestActionValue;
int bestAction;
int s = world.getStateId(pos.getX(), pos.getY());
actionsTaken++;
//Decay EPSILON by 0.001 per action, clamped at zero.
if (EPSILON > 0.0){
EPSILON = Math.max(0.0, EPSILON - 0.001);
}
if (world.QLEARNING){
//Epsilon Greedy Policy, for Q-Learning
if (r.nextDouble() < EPSILON) {
return r.nextInt(actions.length);
} else {
//Do the best action, over Q(s, a)
bestActionValue = Double.NEGATIVE_INFINITY;
bestAction = 0;
for (int a = 0; a < actions.length; a++) {
if (qMap[s][a] > bestActionValue) {
bestActionValue = qMap[s][a];
bestAction = a;
}
}
//If the piece is an observer, the mentor's action a_m may look best; since it cannot be executed directly, substitute the own action whose transition distribution is KL-closest to the mentor's.
if(isObserver()){
int KLDa = kldMinimizer(s);
if (qMap[s][MENTOR_ACTION_INDEX] > qMap[s][bestAction] && KLDa != -1 && actionsTaken > 20){
bestAction = KLDa;
}
}
}
return bestAction;
} else {
//System.out.println("In State " + getPosition());
//Augmented Bellman backup (epsilon-greedy policy)
if (r.nextDouble() < EPSILON) {
return r.nextInt(actions.length);
} else {
//First find the best action according to the agent's own transition model.
bestAction = 0;
ownExpectation = Double.NEGATIVE_INFINITY;
for (int a = 0; a < actions.length; a++){
double sum = 0.0;
for (int t = 0; t < world.numStates(); t++){
sum += transitionProb(s, a, t)*vMap[t];
}
//System.out.println("Action (" + actions[a].dx() + ", " +actions[a].dy() + ") has expectation: " + sum);
if (sum > ownExpectation){
ownExpectation = sum;
bestAction = a;
}
}
bestExpectation = ownExpectation;
boolean mentorTold = false;
if(isObserver){
//Use the mentor's observed transition information.
mentorExpectation = 0.0;
for (int t = 0; t < world.numStates(); t++){
mentorExpectation += mentor.transitionProb(s, t)*vMap[t];
}
//Find the own action that minimizes the KL distance to the mentor's transition distribution:
int kld = kldMinimizer(s);
if (mentorExpectation > bestExpectation && kld != -1){
bestExpectation = mentorExpectation;
bestAction = kld;
mentorTold = true;
}
}
updateVFunction(s,bestAction, bestExpectation);
//printStateValues();
//System.out.println("In State " + getPosition() + " want to take (" + actions[bestAction].dx() + ", " +actions[bestAction].dy() + ")");
if(mentorTold){
//System.out.println("Because mentor told me");
}
return bestAction;
}
}
}
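/*
 * Distance between two discrete distributions. Note that the implemented sum,
 * -sum(p1 * log p2), is the cross-entropy of p1 and p2 rather than the full KL
 * divergence (the sum of p1 * log p1 terms, and the symmetrized form in the
 * commented-out line below, are dropped). Zero probabilities in p2 are smoothed
 * to 1e-5 to keep the logarithm finite.
 */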
public static double klDistance (double[] vect1, double [] vect2){
double sum = 0.0;
for(int t = 0; t < vect1.length; t++){
double p1 = vect1[t];
double p2 = vect2[t];
System.out.print("( " +p1+ ", " +p2+ ") ,");
//sum += (0.5)*(p1* Math.log(p1/p2) + p2* Math.log(p2/p1));
if (p2 == 0){
p2 = 0.00001;
}
sum+= -p1*Math.log(p2);
System.out.println("Sum so far:" + sum);
}
return sum;
}
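//Small self-check of klDistance on hand-built distributions.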
public static void main (String [] args){
double [] v1 = {0, 0, 0, .3, .7};
double [] v2 = {0, 0, 0, .3, .7};
System.out.println("KLDistance: " + klDistance(v1, v2));
double [] v3 = {.3, .7, 0, 0, 0};
double [] v4 = {0, 0, 0, .3, .7};
System.out.println("KLDistance: " + klDistance(v3, v4));
double [] v5 = {.3, .7, 0, 0, 0};
double [] v6 = {0, .7, 0, .3, 0};
System.out.println("KLDistance: " + klDistance(v5, v6));
}
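//Same cross-entropy computation as the static version, taken between the agent's own
//estimated transition distribution under action a and the mentor's observed
//distribution, both starting from state s.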
private double klDistance (int s, int a){
double sum = 0.0;
for(int t = 0; t < world.numStates(); t++){
double p1 = transitionProb(s, a, t);
double p2 = mentor.transitionProb(s, t);
//System.out.print("( " +p1+ ", " +p2+ ") ,");
//sum += (0.5)*(p1* Math.log(p1/p2) + p2* Math.log(p2/p1));
if (p2 == 0){
p2 = 0.00001;
}
sum+= -p1*Math.log(p2);
}
//System.out.println("Sum:" + sum);
return sum;
}
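//Returns the own action whose estimated transition distribution is closest (smallest
//cross-entropy/KL term) to the mentor's observed distribution from state s. Always
//returns a valid action index, so the -1 checks at the call sites never trigger.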
private int kldMinimizer(int s) {
double smallestKLDistance = Double.POSITIVE_INFINITY;
int closestAction = 0;
//ArrayList<Integer> bestBunch = new ArrayList<Integer>();
for (int a = 0; a < actions.length; a++){
double thisDistance = klDistance(s, a);
if (thisDistance < smallestKLDistance){
smallestKLDistance = thisDistance;
closestAction = a;
}
}
//System.out.println("best: " + smallestKLDistance);
return closestAction;
}
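//Records an observed transition: s -> t overall, and s -> t under action a.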
public void updateTally(int s, int a, int t) {
tally[s][t]++;
actionTally[s][a][t]++;
}
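//Bellman-backup value update. Note the convention used here: ALPHA is the weight kept
//on the old estimate, so the new target (reward + GAMMA * bestExpectation) enters with
//weight 1 - ALPHA.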
public void updateVFunction(int s, int a, double bestExpectation) {
double reward = world.getReward(s, a);
//System.out.println("Updating vmap with bestExp: " + bestExpectation);
vMap[s] = (ALPHA)*vMap[s] + (1-ALPHA)*(reward + GAMMA*bestExpectation);
}
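//Value used for printing: the sum of Q-values at s under Q-learning, V(s) otherwise.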
private double perceivedStateValue(int s){
double output = 0.0;
if(world.QLEARNING){
for (int a = 0; a < actions.length; a++){
output+= qMap[s][a];
}
} else {
output = vMap[s];
}
return output;
}
public void printStateValues(){
for (int i = 0; i < world.sizeX(); i++){
for(int j = 0; j < world.sizeY(); j++){
System.out.print(perceivedStateValue(world.getStateId(i, j)) + ", ");
}
System.out.println("");
}
}
public void printQMap(int s){
System.out.println("Going from: " + world.getStateCoords(s).getX() + ", " + world.getStateCoords(s).getY() + ")");
int besta = 0;
for (int a = 0; a < this.actions.length; a++){
if (qMap[s][a] > qMap[s][besta]){
besta = a;
}
}
System.out.println("Taking Action: " + "(" + actions[besta].dx() + ", " + actions[besta].dy() + ")" + " has value: " + qMap[s][besta]);
}
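/*
 * Standard Q-learning update toward reward + GAMMA * max_a' Q(t, a'), again with ALPHA
 * as the weight kept on the old estimate. visibleQMap is updated in parallel so that a
 * mentor's value estimates are available to its observers.
 */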
public void updateQFunction(int s, int a, int t) {
double reward = world.getReward(s, a);
//Terminal state: no successor value to back up, so use the immediate reward only.
if (world.isTerminalState(s)) {
qMap[s][a] = (ALPHA)*qMap[s][a] + (1-ALPHA)*(reward);
visibleQMap[s] = (ALPHA)*visibleQMap[s]+ (1-ALPHA)*(reward);
}else {
//Maximizing next action:
double bestNextActionValue = Double.NEGATIVE_INFINITY;
for (int a_Prime = 0; a_Prime < actions.length; a_Prime++){
if (qMap[t][a_Prime] > bestNextActionValue){
bestNextActionValue = qMap[t][a_Prime];
}
}
qMap[s][a] = (ALPHA)*qMap[s][a] + (1-ALPHA)*(reward + GAMMA*bestNextActionValue);
visibleQMap[s] = (ALPHA)*visibleQMap[s] + (1-ALPHA)*(reward + GAMMA*bestNextActionValue);
}
}
}