package edu.fudan.nlp.parser.dep; import java.util.ArrayList; import java.util.List; import edu.fudan.ml.types.alphabet.AlphabetFactory; import edu.fudan.ml.types.alphabet.IFeatureAlphabet; import edu.fudan.ml.types.sv.HashSparseVector; import edu.fudan.ml.types.sv.ISparseVector; import edu.fudan.nlp.parser.Sentence; import edu.fudan.nlp.parser.dep.ParsingState.Action; import gnu.trove.list.array.TIntArrayList; /** * 句法分析过程中的状态,及在此状态上的一系列操作 * * 句法分析由状态的转换完成,转换操作涉及在当前状态提取特征,动作执行。 动作的预测在Parser 中完成 * * @author xpqiu */ public class JointParsingState{ private static final String END = "E*"; private static final String START = "S*"; private static final String CH_L_LEX = "/LL/"; private static final String CH_R_LEX = "/RL/"; private static final String NULL = "N*"; private static final String CH_R_POS = "/RP/"; private static final String CH_L_POS = "/LP/"; private static final String LEX = "/L/"; private static final String POS = "/P/"; private int ysize = 0; /** * 动作类型 * @author xpqiu * */ public enum Action { SHIFT, LEFT, RIGHT } protected Sentence sent; protected List<DependencyTree> trees; protected int leftFocus; // 非SHIFT动作中概率较大的动作的概率 protected float[] probsOfBuild; // 非SHIFT动作中概率较大的动作 protected Action[] actionsOfBuild; // 是否执行过非SHIFT动作 protected boolean isUpdated = false; protected boolean isFinal = false; private String[] depClassOfBuild; /** * 构造函数 * * 由句子实例初始化状态 * * @param instance * 句子实例 * @param factory2 */ public JointParsingState(Sentence instance) { trees = new ArrayList<DependencyTree>(); for (int i = 0; i < instance.length(); i++) { String word = instance.getWordAt(i); String pos = instance.getTagAt(i); DependencyTree tree = new DependencyTree(i, word, pos); trees.add(tree); } this.sent = instance; if(trees.size()==0) return; probsOfBuild = new float[trees.size() - 1]; actionsOfBuild = new Action[trees.size() - 1]; depClassOfBuild = new String[trees.size()-1]; } /** * 得到当前状态的特征 * * @return 特征表 * @throws Exception */ public ArrayList<String> getFeatures() { if (isFinalState()) return null; ArrayList<String> featurelist = new ArrayList<String>(); int rightFocus = leftFocus + 1; // ISparseVector vec = new HashSparseVector(); StringBuilder posFeature1 = new StringBuilder(); posFeature1.append("+-2").append(POS).append(trees.get(leftFocus).pos) .append("/").append(trees.get(rightFocus).pos); featurelist.add(posFeature1.toString()); StringBuilder lexFeature1 = new StringBuilder(); lexFeature1.append("+-2").append(LEX).append(trees.get(leftFocus).word) .append("/").append(trees.get(rightFocus).word); featurelist.add(lexFeature1.toString()); // 设定上下文窗口大小 int l = 2; int r = 2; for (int i = 0; i <= l; i++) { // 特征前缀 String posFeature = "-" + String.valueOf(i) + POS; String lexFeature = "-" + String.valueOf(i) + LEX; String lcLexFeature = "-" + String.valueOf(i) + CH_L_LEX; String lcPosFeature = "-" + String.valueOf(i) + CH_L_POS; String rcLexFeature = "-" + String.valueOf(i) + CH_R_LEX; String rcPosFeature = "-" + String.valueOf(i) + CH_R_POS; if (leftFocus - i < 0) { featurelist.add(lexFeature + START + String.valueOf(i - leftFocus)); featurelist.add(posFeature + START + String.valueOf(i - leftFocus)); } else { featurelist.add(lexFeature + sent.words[trees.get(leftFocus - i).id]); featurelist.add(posFeature + sent.tags[trees.get(leftFocus - i).id]); if (trees.get(leftFocus - i).leftChilds.size() != 0) { for (int j = 0; j < trees.get(leftFocus - i).leftChilds .size(); j++) { int leftChildIndex = trees.get(leftFocus - i).leftChilds .get(j).id; featurelist.add(lcLexFeature + sent.words[leftChildIndex]); featurelist.add(lcPosFeature + sent.tags[leftChildIndex]); } }else{ featurelist.add(lcLexFeature + NULL); featurelist.add(lcPosFeature + NULL); } if (trees.get(leftFocus - i).rightChilds.size() != 0) { for (int j = 0; j < trees.get(leftFocus - i).rightChilds .size(); j++) { int rightChildIndex = trees.get(leftFocus - i).rightChilds .get(j).id; featurelist.add(rcLexFeature + sent.words[rightChildIndex]); featurelist.add(rcPosFeature + sent.tags[rightChildIndex]); } }else{ featurelist.add(rcLexFeature + NULL); featurelist.add(rcPosFeature + NULL); } } } for (int i = 0; i <= r; i++) { String posFeature = "+" + String.valueOf(i) + POS; String lexFeature = "+" + String.valueOf(i) + LEX; String lcLexFeature = "+" + String.valueOf(i) + CH_L_LEX; String lcPosFeature = "+" + String.valueOf(i) + CH_L_POS; String rcLexFeature = "+" + String.valueOf(i) + CH_R_LEX; String rcPosFeature = "+" + String.valueOf(i) + CH_R_POS; if (rightFocus + i >= trees.size()) { featurelist.add(lexFeature+ END+ String.valueOf(rightFocus + i- trees.size() + 3)); featurelist.add(posFeature+ END+ String.valueOf(rightFocus + i- trees.size() + 3)); } else { featurelist.add(lexFeature+ sent.words[trees.get(rightFocus + i).id]); featurelist.add(posFeature+ sent.tags[trees.get(rightFocus + i).id]); if (trees.get(rightFocus + i).leftChilds.size() != 0) { for (int j = 0; j < trees.get(rightFocus + i).leftChilds .size(); j++) { int leftChildIndex = trees.get(rightFocus + i).leftChilds .get(j).id; featurelist.add(lcLexFeature+ sent.words[leftChildIndex]); featurelist.add(lcPosFeature+ sent.tags[leftChildIndex]); } }else{ featurelist.add(lcLexFeature + NULL); featurelist.add(lcPosFeature + NULL); } if (trees.get(rightFocus + i).rightChilds.size() != 0) { for (int j = 0; j < trees.get(rightFocus + i).rightChilds .size(); j++) { int rightChildIndex = trees.get(rightFocus + i).rightChilds .get(j).id; featurelist.add(rcLexFeature+ sent.words[rightChildIndex]); featurelist.add(rcPosFeature+ sent.tags[rightChildIndex]); } }else{ featurelist.add(rcLexFeature + NULL); featurelist.add(rcPosFeature + NULL); } } } return featurelist; } public boolean isFinalState() { return trees.size()==0||trees.size() == 1 || isFinal; } /** * 状态转换,动作为SHIFT * * 动作为SHIFT,但保存第二大可能的动作,当一列动作都是SHIFT时,执行概率最大的第二大动作 * * @param action * 第二大可能的动作 * @param prob * 第二大可能的动作的概率 */ public void next(Action action, float prob,String depClass) { probsOfBuild[leftFocus] = prob; actionsOfBuild[leftFocus] = action; depClassOfBuild[leftFocus] = depClass; leftFocus++; if (leftFocus >= trees.size() - 1) { if (!isUpdated) { int maxIndex = 0; float maxValue = Float.NEGATIVE_INFINITY; for (int i = 0; i < probsOfBuild.length; i++) if (probsOfBuild[i] > maxValue) { maxValue = probsOfBuild[i]; maxIndex = i; } leftFocus = maxIndex; next(actionsOfBuild[leftFocus],depClassOfBuild[leftFocus]); } back(); } } /** * 状态转换, 执行动作 * * @param action * 要执行的动作 */ public void next(Action action,String depClass) { assert (!isFinalState()); // 左焦点词在句子中的位置 int lNode = trees.get(leftFocus).id; int rNode = trees.get(leftFocus + 1).id; switch (action) { case LEFT: trees.get(leftFocus + 1).setDepClass(depClass); trees.get(leftFocus).addRightChild(trees.get(leftFocus + 1)); trees.remove(leftFocus + 1); isUpdated = true; break; case RIGHT: trees.get(leftFocus).setDepClass(depClass); trees.get(leftFocus + 1).addLeftChild(trees.get(leftFocus)); trees.remove(leftFocus); isUpdated = true; break; default: leftFocus++; } if (leftFocus >= trees.size() - 1) { if (!isUpdated) { isFinal = true; } back(); } } public int[] getFocusIndices() { assert (!isFinalState()); int[] indices = new int[2]; indices[0] = trees.get(leftFocus).id; indices[1] = trees.get(leftFocus + 1).id; return indices; } /** * 将序列第一二个词设为焦点词 */ protected void back() { isUpdated = false; leftFocus = 0; probsOfBuild = new float[trees.size() - 1]; actionsOfBuild = new Action[trees.size() - 1]; depClassOfBuild = new String[trees.size() - 1]; } }