package edu.fudan.nlp.parser.dep; import java.util.*; import edu.fudan.ml.types.alphabet.AlphabetFactory; import edu.fudan.ml.types.alphabet.IFeatureAlphabet; import edu.fudan.ml.types.sv.HashSparseVector; import edu.fudan.nlp.parser.Sentence; /** * 句法分析过程中的状态,及在此状态上的一系列操作 * * 句法分析由状态的转换完成,转换操作涉及在当前状态提取特征,动作执行。 动作的预测在Parser 中完成 * * @author cshen * @version Feb 16, 2009 * @see YamadaParser */ public class ParsingState { /** * 动作类型 */ public enum Action { SHIFT, LEFT, RIGHT } protected static final int DefaultLabelNum = 25; protected Sentence instance; protected List<DependencyTree> trees; protected int leftFocus; // 非SHIFT动作中概率较大的动作的概率 protected float[] probsOfBuild; // 非SHIFT动作中概率较大的动作 protected Action[] actionsOfBuild; // 是否执行过非SHIFT动作 protected boolean isUpdated = false; protected boolean isFinal = false; protected AlphabetFactory factory; /** * 构造函数 * * 由句子实例初始化状态 * * @param instance * 句子实例 */ public ParsingState(Sentence instance,AlphabetFactory factory2) { factory = factory2; trees = new ArrayList<DependencyTree>(); for (int i = 0; i < instance.length(); i++) { String word = instance.getWordAt(i); String pos = instance.getTagAt(i); DependencyTree tree = new DependencyTree(i, word, pos); trees.add(tree); } this.instance = instance; probsOfBuild = new float[trees.size() - 1]; actionsOfBuild = new Action[trees.size() - 1]; } public boolean isFinalState() { return trees.size() == 1 || isFinal; } public int[] getFocusIndices() { assert (!isFinalState()); int[] indices = new int[2]; indices[0] = trees.get(leftFocus).id; indices[1] = trees.get(leftFocus + 1).id; return indices; } public String getLeftPos() { return trees.get(leftFocus).pos; } /** * 得到当前状态的特征 * * @return 特征表,其中key是有用的,value没有用 * @throws Exception */ public HashSparseVector getFeatures() { if (isFinalState()) return null; IFeatureAlphabet features = factory.DefaultFeatureAlphabet(); int rightFocus = leftFocus + 1; HashSparseVector vec = new HashSparseVector(); // 设定上下文窗口大小 int l = 2; int r = 2; for (int i = 0; i <= l; i++) { // 词性特征前缀 String posFeature = "-" + new Integer(i).toString() + "/pos/"; String lexFeature = "-" + new Integer(i).toString() + "/lex/"; String lcLexFeature = "-" + new Integer(i).toString() + "/ch-L-lex/"; String lcPosFeature = "-" + new Integer(i).toString() + "/ch-L-pos/"; String rcLexFeature = "-" + new Integer(i).toString() + "/ch-R-lex/"; String rcPosFeature = "-" + new Integer(i).toString() + "/ch-R-pos/"; if (leftFocus - i < 0) { addFeature(features, vec, lexFeature + "START" + String.valueOf(i - leftFocus)); addFeature(features, vec, posFeature + "START" + String.valueOf(i - leftFocus)); } else { addFeature( features, vec, lexFeature + instance.words[trees.get(leftFocus - i).id]); addFeature(features, vec, posFeature + instance.tags[trees.get(leftFocus - i).id]); if (trees.get(leftFocus - i).leftChilds.size() != 0) { for (int j = 0; j < trees.get(leftFocus - i).leftChilds .size(); j++) { int leftChildIndex = trees.get(leftFocus - i).leftChilds .get(j).id; addFeature(features, vec, lcLexFeature + instance.words[leftChildIndex]); addFeature(features, vec, lcPosFeature + instance.tags[leftChildIndex]); } } if (trees.get(leftFocus - i).rightChilds.size() != 0) { for (int j = 0; j < trees.get(leftFocus - i).rightChilds .size(); j++) { int rightChildIndex = trees.get(leftFocus - i).rightChilds .get(j).id; addFeature(features, vec, rcLexFeature + instance.words[rightChildIndex]); addFeature(features, vec, rcPosFeature + instance.tags[rightChildIndex]); } } } } for (int i = 0; i <= r; i++) { String posFeature = "+" + new Integer(i).toString() + "/pos/"; String lexFeature = "+" + new Integer(i).toString() + "/lex/"; String lcLexFeature = "+" + new Integer(i).toString() + "/ch-L-lex/"; String lcPosFeature = "+" + new Integer(i).toString() + "/ch-L-pos/"; String rcLexFeature = "+" + new Integer(i).toString() + "/ch-R-lex/"; String rcPosFeature = "+" + new Integer(i).toString() + "/ch-R-pos/"; if (rightFocus + i >= trees.size()) { addFeature( features, vec, lexFeature + "END" + String.valueOf(rightFocus + i - trees.size() + 3)); addFeature( features, vec, posFeature + "END" + String.valueOf(rightFocus + i - trees.size() + 3)); } else { addFeature( features, vec, lexFeature + instance.words[trees.get(rightFocus + i).id]); addFeature(features, vec, posFeature + instance.tags[trees.get(rightFocus + i).id]); if (trees.get(rightFocus + i).leftChilds.size() != 0) { for (int j = 0; j < trees.get(rightFocus + i).leftChilds .size(); j++) { int leftChildIndex = trees.get(rightFocus + i).leftChilds .get(j).id; addFeature(features, vec, lcLexFeature + instance.words[leftChildIndex]); addFeature(features, vec, lcPosFeature + instance.tags[leftChildIndex]); } } if (trees.get(rightFocus + i).rightChilds.size() != 0) { for (int j = 0; j < trees.get(rightFocus + i).rightChilds .size(); j++) { int rightChildIndex = trees.get(rightFocus + i).rightChilds .get(j).id; addFeature(features, vec, rcLexFeature + instance.words[rightChildIndex]); addFeature(features, vec, rcPosFeature + instance.tags[rightChildIndex]); } } } } return vec; } protected void addFeature(IFeatureAlphabet features, HashSparseVector vec, String str) { int idx = features.lookupIndex(str, DefaultLabelNum); if (idx != -1) { vec.put(idx, 1); } } /** * 状态转换,动作为SHIFT * * 动作为SHIFT,但保存第二大可能的动作,当一列动作都是SHIFT时,执行概率最大的第二大动作 * * @param action * 第二大可能的动作 * @param prob * 第二大可能的动作的概率 */ public void next(Action action, float prob) { probsOfBuild[leftFocus] = prob; actionsOfBuild[leftFocus] = action; leftFocus++; if (leftFocus >= trees.size() - 1) { if (!isUpdated) { int maxIndex = 0; float maxValue = 0; for (int i = 0; i < probsOfBuild.length; i++) if (probsOfBuild[i] > maxValue) { maxValue = probsOfBuild[i]; maxIndex = i; } leftFocus = maxIndex; next(actionsOfBuild[leftFocus]); } back(); } } /** * 将序列第一二个词设为焦点词 */ protected void back() { isUpdated = false; leftFocus = 0; probsOfBuild = new float[trees.size() - 1]; actionsOfBuild = new Action[trees.size() - 1]; } /** * 状态转换, 执行动作 * * @param action * 要执行的动作 */ public void next(Action action) { // assert (action.equalsIgnoreCase("left") // || action.equalsIgnoreCase("right") || action // .equalsIgnoreCase("shift")); assert (!isFinalState()); // 左焦点词在句子中的位置 int lNode = trees.get(leftFocus).id; int rNode = trees.get(leftFocus + 1).id; switch (action) { case LEFT: // add for counting two types of errors // if (instance.heads[rNode] == lNode) // break; // end trees.get(leftFocus).addRightChild(trees.get(leftFocus + 1)); trees.remove(leftFocus + 1); isUpdated = true; break; case RIGHT: // add for counting two types of errors // if (instance.heads[lNode] == rNode) // break; // end trees.get(leftFocus + 1).addLeftChild(trees.get(leftFocus)); trees.remove(leftFocus); isUpdated = true; break; default: leftFocus++; } if (leftFocus >= trees.size() - 1) { if (!isUpdated) { isFinal = true; } back(); } } }