package edu.fudan.nlp.parser.dep;
import java.util.*;
import edu.fudan.ml.types.alphabet.AlphabetFactory;
import edu.fudan.ml.types.alphabet.IFeatureAlphabet;
import edu.fudan.ml.types.sv.HashSparseVector;
import edu.fudan.nlp.parser.Sentence;
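/**
 * A state of the dependency-parsing process, together with the operations
 * that can be performed on it.
 *
 * Parsing proceeds as a sequence of state transitions. Each transition
 * consists of extracting features from the current state and then executing
 * an action; predicting which action to execute is done by the parser, not
 * by this class.
 *
 * @author cshen
 * @version Feb 16, 2009
 * @see YamadaParser
 */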
public class ParsingState {
    /**
     * Transition types that can be applied to the two focus trees.
     */
    public enum Action {
        SHIFT, LEFT, RIGHT
    }

    /**
     * Second argument passed to {@code IFeatureAlphabet.lookupIndex} for every
     * feature (presumably the number of index slots reserved per feature, one
     * per label -- confirm against the alphabet implementation).
     */
    protected static final int DefaultLabelNum = 25;

    /** Number of trees to the left of the left focus included in the context window. */
    private static final int LEFT_WINDOW = 2;
    /** Number of trees to the right of the right focus included in the context window. */
    private static final int RIGHT_WINDOW = 2;
    /**
     * Constant added to the overshoot in "END" boundary features. It must stay
     * as is: changing it would change the generated feature strings.
     */
    private static final int END_MARKER_OFFSET = 3;

    /** The sentence being parsed. */
    protected Sentence instance;
    /** Current sequence of partial dependency trees; shrinks by one on every LEFT/RIGHT. */
    protected List<DependencyTree> trees;
    /** Index in {@link #trees} of the left focus tree; the right focus is at {@code leftFocus + 1}. */
    protected int leftFocus;
    /** Per focus position: score of the best non-SHIFT action recorded during the current pass. */
    protected float[] probsOfBuild;
    /** Per focus position: the best non-SHIFT action recorded during the current pass. */
    protected Action[] actionsOfBuild;
    /** Whether a LEFT or RIGHT action has been executed since the last call to {@link #back()}. */
    protected boolean isUpdated = false;
    /** Set once a full pass ended without any LEFT or RIGHT action being executed. */
    protected boolean isFinal = false;
    /** Source of the feature alphabet used by {@link #getFeatures()}. */
    protected AlphabetFactory factory;

    /**
     * Creates the initial state for a sentence: one single-node tree per
     * token, with the first two trees as the focus pair.
     *
     * @param instance
     *            the sentence to parse
     * @param factory2
     *            factory providing the feature alphabet
     */
    public ParsingState(Sentence instance, AlphabetFactory factory2) {
        factory = factory2;
        trees = new ArrayList<DependencyTree>();
        for (int i = 0; i < instance.length(); i++) {
            String word = instance.getWordAt(i);
            String pos = instance.getTagAt(i);
            trees.add(new DependencyTree(i, word, pos));
        }
        this.instance = instance;
        allocateCandidateBuffers();
    }

    /**
     * Tells whether parsing has finished.
     *
     * @return {@code true} if at most one tree is left, or if a complete pass
     *         ended without any LEFT or RIGHT action
     */
    public boolean isFinalState() {
        // "<= 1" rather than "== 1" so that an empty sentence is final as well.
        return trees.size() <= 1 || isFinal;
    }

    /**
     * Returns the ids of the roots of the two focus trees.
     *
     * @return a two-element array: left focus id, right focus id
     */
    public int[] getFocusIndices() {
        assert (!isFinalState());
        int[] indices = new int[2];
        indices[0] = trees.get(leftFocus).id;
        indices[1] = trees.get(leftFocus + 1).id;
        return indices;
    }

    /**
     * Returns the part-of-speech tag stored in the left focus tree.
     *
     * @return the {@code pos} field of the left focus tree
     */
    public String getLeftPos() {
        return trees.get(leftFocus).pos;
    }

    /**
     * Extracts the features of the current state.
     *
     * For the left focus and the {@value #LEFT_WINDOW} trees before it, and
     * for the right focus and the {@value #RIGHT_WINDOW} trees after it, the
     * word and tag of the tree root and of each of its direct children are
     * turned into features. Positions outside the tree sequence produce
     * "START"/"END" boundary features instead.
     *
     * @return a sparse vector holding value 1 at the index of every feature
     *         the alphabet resolved (lookups returning -1 are skipped), or
     *         {@code null} if this is a final state
     */
    public HashSparseVector getFeatures() {
        if (isFinalState()) {
            return null;
        }
        IFeatureAlphabet features = factory.DefaultFeatureAlphabet();
        int rightFocus = leftFocus + 1;
        HashSparseVector vec = new HashSparseVector();

        // Left context: left focus (offset 0) and the trees preceding it.
        for (int i = 0; i <= LEFT_WINDOW; i++) {
            String prefix = "-" + i;
            int position = leftFocus - i;
            if (position < 0) {
                int overshoot = i - leftFocus;
                addFeature(features, vec, prefix + "/lex/" + "START" + overshoot);
                addFeature(features, vec, prefix + "/pos/" + "START" + overshoot);
            } else {
                addTreeFeatures(features, vec, prefix, trees.get(position));
            }
        }

        // Right context: right focus (offset 0) and the trees following it.
        for (int i = 0; i <= RIGHT_WINDOW; i++) {
            String prefix = "+" + i;
            int position = rightFocus + i;
            if (position >= trees.size()) {
                int overshoot = position - trees.size() + END_MARKER_OFFSET;
                addFeature(features, vec, prefix + "/lex/" + "END" + overshoot);
                addFeature(features, vec, prefix + "/pos/" + "END" + overshoot);
            } else {
                addTreeFeatures(features, vec, prefix, trees.get(position));
            }
        }
        return vec;
    }

    /**
     * Adds the word/tag features of one tree root and of its direct left and
     * right children, in that order.
     *
     * @param features
     *            alphabet used to resolve feature strings
     * @param vec
     *            vector receiving the features
     * @param prefix
     *            signed window offset of the tree, e.g. {@code "-1"} or {@code "+0"}
     * @param tree
     *            the tree to describe
     */
    private void addTreeFeatures(IFeatureAlphabet features,
            HashSparseVector vec, String prefix, DependencyTree tree) {
        addFeature(features, vec, prefix + "/lex/" + instance.words[tree.id]);
        addFeature(features, vec, prefix + "/pos/" + instance.tags[tree.id]);
        for (int j = 0; j < tree.leftChilds.size(); j++) {
            int leftChildIndex = tree.leftChilds.get(j).id;
            addFeature(features, vec, prefix + "/ch-L-lex/"
                    + instance.words[leftChildIndex]);
            addFeature(features, vec, prefix + "/ch-L-pos/"
                    + instance.tags[leftChildIndex]);
        }
        for (int j = 0; j < tree.rightChilds.size(); j++) {
            int rightChildIndex = tree.rightChilds.get(j).id;
            addFeature(features, vec, prefix + "/ch-R-lex/"
                    + instance.words[rightChildIndex]);
            addFeature(features, vec, prefix + "/ch-R-pos/"
                    + instance.tags[rightChildIndex]);
        }
    }

    /**
     * Looks a feature string up in the alphabet and, unless the lookup
     * returns -1, stores value 1 at the resulting index.
     *
     * @param features
     *            alphabet used to resolve the feature string
     * @param vec
     *            vector receiving the feature
     * @param str
     *            the feature string
     */
    protected void addFeature(IFeatureAlphabet features, HashSparseVector vec,
            String str) {
        int idx = features.lookupIndex(str, DefaultLabelNum);
        if (idx != -1) {
            vec.put(idx, 1);
        }
    }

    /**
     * Performs a SHIFT while remembering the best alternative action for the
     * current focus pair.
     *
     * If the focus thereby moves past the last pair and no LEFT/RIGHT action
     * was executed during this pass, the remembered alternative with the
     * highest score is executed at its own position. In either case the focus
     * then returns to the first pair.
     *
     * @param action
     *            the best non-SHIFT action for the current focus pair
     * @param prob
     *            the score of that action
     */
    public void next(Action action, float prob) {
        assert (!isFinalState());
        probsOfBuild[leftFocus] = prob;
        actionsOfBuild[leftFocus] = action;
        leftFocus++;
        if (leftFocus >= trees.size() - 1) {
            if (!isUpdated) {
                leftFocus = indexOfBestCandidate();
                next(actionsOfBuild[leftFocus]);
            }
            back();
        }
    }

    /**
     * Finds the position whose remembered alternative action has the highest
     * score.
     *
     * The scan starts from negative infinity, so the maximum is also found
     * when every score is zero or negative. Positions without a remembered
     * action are skipped so that their default score of 0 cannot win over a
     * real candidate.
     *
     * @return the index of the best candidate; 0 if there is none
     */
    private int indexOfBestCandidate() {
        int maxIndex = 0;
        float maxValue = Float.NEGATIVE_INFINITY;
        for (int i = 0; i < probsOfBuild.length; i++) {
            if (actionsOfBuild[i] != null && probsOfBuild[i] > maxValue) {
                maxValue = probsOfBuild[i];
                maxIndex = i;
            }
        }
        return maxIndex;
    }

    /**
     * Starts a new pass: moves the focus back to the first two trees and
     * clears everything recorded during the previous pass.
     */
    protected void back() {
        isUpdated = false;
        leftFocus = 0;
        allocateCandidateBuffers();
    }

    /**
     * Allocates fresh candidate buffers with one slot per adjacent tree pair.
     * The size is clamped at zero so that an empty tree sequence does not
     * cause a {@link NegativeArraySizeException}.
     */
    private void allocateCandidateBuffers() {
        int pairs = Math.max(0, trees.size() - 1);
        probsOfBuild = new float[pairs];
        actionsOfBuild = new Action[pairs];
    }

    /**
     * Executes an action on the current focus pair.
     *
     * <ul>
     * <li>LEFT: the right focus tree becomes a right child of the left focus
     * tree and is removed from the sequence.</li>
     * <li>RIGHT: the left focus tree becomes a left child of the right focus
     * tree and is removed from the sequence.</li>
     * <li>SHIFT: the focus moves one tree to the right.</li>
     * </ul>
     *
     * When the focus ends up past the last pair, the state becomes final if
     * no LEFT/RIGHT action was executed during the pass; the focus then
     * returns to the first pair.
     *
     * @param action
     *            the action to execute
     */
    public void next(Action action) {
        assert (!isFinalState());
        switch (action) {
        case LEFT:
            trees.get(leftFocus).addRightChild(trees.get(leftFocus + 1));
            trees.remove(leftFocus + 1);
            isUpdated = true;
            break;
        case RIGHT:
            trees.get(leftFocus + 1).addLeftChild(trees.get(leftFocus));
            trees.remove(leftFocus);
            isUpdated = true;
            break;
        default:
            leftFocus++;
        }
        if (leftFocus >= trees.size() - 1) {
            if (!isUpdated) {
                isFinal = true;
            }
            back();
        }
    }
}