/* Copyright (C) 2008 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */

package cc.mallet.fst.semi_supervised;

import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashSet;

import cc.mallet.types.Alphabet;

/**
 * Maps states in the lattice to labels. <p>
 *
 * When a custom state machine is constructed while training a CRF, it is
 * possible that several states map to the same label. In this case, there will
 * be a discrepancy between the number of states used in the lattice and the
 * number of output labels (targets). Use this mapping if such an FST is used in
 * training a CRF model. <p>
 *
 * If the number of states in the lattice is expected to be equal to the number
 * of output labels, then set <tt>isOneToOneMap</tt> to <tt>true</tt> in the
 * constructor. <p>
 *
 * This map associates the state with the appropriate label (indexing is zero
 * onwards). <p>
 *
 * <b>Note:</b> Add the states to the map in the same order in which they are
 * added to the CRF while constructing the FST. This is necessary to keep a
 * correct mapping of the state indices in this map to the state indices used
 * within the CRF.
 *
 * @author Gaurav Chandalia
 */
public class StateLabelMap {

  /** Pseudo-label index assigned to the special start state. */
  public final static int START_LABEL = -2;

  // mapping state names to integers
  private final Alphabet stateAlphabet;
  // mapping labels to integers
  private final Alphabet labelAlphabet;

  // true if a standard FST is used (using one of the methods provided in CRF
  // class), in this case the state and label alphabets are the same
  private final boolean isOneToOneMap;

  // key: index identifying a state
  // value: index identifying a label that the state maps to in the state
  // machine
  private final HashMap<Integer, Integer> stateToLabel;
  // key: index identifying a label
  // value: indices of states that are associated with the label
  private final HashMap<Integer, LinkedHashSet<Integer>> labelToState;

  /**
   * Initializes the state and label maps without a special start state.
   * Equivalent to calling the three-argument constructor with a start state
   * index of <tt>-1</tt>.
   *
   * @param labelAlphabet Target alphabet that maps label names to integers.
   * @param isOneToOneMap True if a one to one mapping of states and labels
   *        is to be created.
   */
  public StateLabelMap(Alphabet labelAlphabet, boolean isOneToOneMap) {
    this(labelAlphabet, isOneToOneMap, -1);
  }

  /**
   * Initializes the state and label maps.
   *
   * <b>Note:</b> If a standard FST is used (using one of the methods
   * provided in CRF class), the state and label alphabets are the same. In this
   * case, there will be a one-to-one mapping between the states and labels.
   * Also, the <tt>addState</tt> method can no longer be used. This is done
   * when <tt>isOneToOneMap</tt> is <tt>true</tt>.
   *
   * @param labelAlphabet Target alphabet that maps label names to integers.
   * @param isOneToOneMap True if a one to one mapping of states and labels
   *        is to be created (ignoring the start label)
   * @param startStateIndex Index of special START state or -1
   */
  public StateLabelMap(Alphabet labelAlphabet, boolean isOneToOneMap,
                       int startStateIndex) {
    this.labelAlphabet = labelAlphabet;
    this.isOneToOneMap = isOneToOneMap;
    // in the one-to-one case the label alphabet doubles as the state alphabet
    this.stateAlphabet = isOneToOneMap ? labelAlphabet : new Alphabet();

    stateToLabel = new HashMap<Integer, Integer>();
    labelToState = new HashMap<Integer, LinkedHashSet<Integer>>();

    // register every label currently in the alphabet; in the one-to-one case
    // each label index is also its own (single) state index, otherwise the
    // label starts out with an empty set of states
    Iterator<?> labelIter = labelAlphabet.iterator();
    while (labelIter.hasNext()) {
      int labelIndex = labelAlphabet.lookupIndex(labelIter.next(), false);
      LinkedHashSet<Integer> stateIndices = new LinkedHashSet<Integer>();
      if (isOneToOneMap) {
        stateToLabel.put(labelIndex, labelIndex);
        stateIndices.add(labelIndex);
      }
      labelToState.put(labelIndex, stateIndices);
    }

    if (startStateIndex != -1) {
      addStartState(startStateIndex);
    }
  }

  /**
   * If there is a special start state in the CRF
   * that is not included in the label set, then
   * we need to add it here.  Constraints can then
   * check if a state maps to the special START_LABEL,
   * and handle this appropriately.
   *
   * @param index Index of the special start state in the CRF.
   */
  public void addStartState(int index) {
    this.stateToLabel.put(index, START_LABEL);
  }

  /**
   * Returns <tt>true</tt> if there is a one-to-one mapping between the states
   * and labels and <tt>false</tt> otherwise.
   */
  public boolean isOneToOneMapping() {
    return isOneToOneMap;
  }

  /**
   * Returns the number of labels in the map.
   */
  public int getNumLabels() {
    return labelToState.size();
  }

  /**
   * Returns the number of states in the map. A start state registered through
   * {@link #addStartState(int)} is counted as well.
   */
  public int getNumStates() {
    return stateToLabel.size();
  }

  /**
   * Returns the label (target) alphabet.
   */
  public Alphabet getLabelAlphabet() {
    return labelAlphabet;
  }

  /**
   * Returns the state alphabet.
   */
  public Alphabet getStateAlphabet() {
    return stateAlphabet;
  }

  /**
   * Returns the label index mapped to the state index.
   *
   * @param stateIndex State index.
   * @return Index of the label that is mapped to the state
   *         ({@link #START_LABEL} for a registered start state). Returns
   *         <tt>-1</tt> if there is no label (index) that maps to the
   *         specified state.
   */
  public int getLabelIndex(int stateIndex) {
    // no null values are ever stored in the map, so a null result can only
    // mean that the state is unknown
    Integer labelIndex = stateToLabel.get(stateIndex);
    if (labelIndex == null) {
      return -1;
    }
    return labelIndex;
  }

  /**
   * Returns the state indices that map to the label index.
   *
   * @param labelIndex Label (target) index.
   * @return Indices of the states that map to the label; the set is empty if
   *         the label is registered but no state has been added for it yet.
   *         Returns <tt>null</tt> if the label index is not registered in
   *         this map. The returned set is the map's internal set, not a copy.
   */
  public LinkedHashSet<Integer> getStateIndices(int labelIndex) {
    return labelToState.get(labelIndex);
  }

  /**
   * Adds a state to the map.
   *
   * @param stateName Name of the state.
   * @param labelName Label (target) name with which the state is associated.
   * @return The index associated with the state that was added.
   * @throws IllegalArgumentException If an invalid label name or a duplicate
   *         state name is provided.
   * @throws IllegalStateException If this method is called when there is a
   *         one-to-one mapping between the states and labels.
   */
  public int addState(String stateName, String labelName) {
    if (isOneToOneMap) {
      throw new IllegalStateException("Trying to add a state when there is a "
                                      + "one to one mapping between the states "
                                      + "and labels.");
    }

    // get the label index
    int labelIndex = labelAlphabet.lookupIndex(labelName, false);
    if (labelIndex == -1) {
      throw new IllegalArgumentException("Invalid label: " + labelName);
    }

    // reject duplicates before the state is actually added to the alphabet
    int stateIndex = stateAlphabet.lookupIndex(stateName, false);
    if (stateIndex != -1) {
      throw new IllegalArgumentException("Duplicate state: " + stateName);
    }
    stateIndex = stateAlphabet.lookupIndex(stateName, true);

    // It is possible that a label is never seen in the training data. In that
    // case the label may have been added to the label (target) alphabet only
    // after this map was constructed, so it has no entry here yet; create the
    // entry on demand.
    LinkedHashSet<Integer> stateIndices = labelToState.get(labelIndex);
    if (stateIndices == null) {
      stateIndices = new LinkedHashSet<Integer>();
      labelToState.put(labelIndex, stateIndices);
    }
    stateIndices.add(stateIndex);
    stateToLabel.put(stateIndex, labelIndex);

    return stateIndex;
  }
}