package edu.stanford.nlp.ie; import edu.stanford.nlp.ie.crf.CRFClassifier; import edu.stanford.nlp.ie.ner.CMMClassifier; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.HasWord; import edu.stanford.nlp.sequences.DocumentReaderAndWriter; import edu.stanford.nlp.util.CoreMap; import edu.stanford.nlp.util.ErasureUtils; import edu.stanford.nlp.util.Generics; import edu.stanford.nlp.util.StringUtils; import java.io.FileNotFoundException; import java.io.ObjectInputStream; import java.io.IOException; import java.util.*; /** * Merges the outputs of two or more AbstractSequenceClassifiers according to * a simple precedence scheme: any given base classifier contributes only * classifications of labels that do not exist in the base classifiers specified * before, and that do not have any token overlap with labels assigned by * higher priority classifiers. * <p> * This is a pure AbstractSequenceClassifier, i.e., it sets the AnswerAnnotation label. * If you work with NER classifiers, you should use NERClassifierCombiner. This class * inherits from ClassifierCombiner, and takes care that all AnswerAnnotations are also * copied to NERAnnotation. * <p> * You can specify up to 10 base classifiers using the -loadClassifier1 to -loadClassifier10 * properties. We also maintain the older usage when only two base classifiers were accepted, * specified using -loadClassifier and -loadAuxClassifier. * <p> * ms 2009: removed all NER functionality (see NERClassifierCombiner), changed code so it accepts an arbitrary number of base classifiers, removed dead code. * * @author Chris Cox * @author Mihai Surdeanu */ public class ClassifierCombiner<IN extends CoreMap & HasWord> extends AbstractSequenceClassifier<IN> { private static final boolean DEBUG = false; private List<AbstractSequenceClassifier<IN>> baseClassifiers; private static final String DEFAULT_AUX_CLASSIFIER_PATH="/u/nlp/data/ner/goodClassifiers/english.muc.7class.distsim.crf.ser.gz"; private static final String DEFAULT_CLASSIFIER_PATH="/u/nlp/data/ner/goodClassifiers/english.all.3class.distsim.crf.ser.gz"; /** * NORMAL means that if one classifier uses PERSON, later classifiers can't also add PERSON, for example. <br> * HIGH_RECALL allows later models to set PERSON as long as it doesn't clobber existing annotations. */ static enum CombinationMode { NORMAL, HIGH_RECALL } static final CombinationMode DEFAULT_COMBINATION_MODE = CombinationMode.NORMAL; static final String COMBINATION_MODE_PROPERTY = "ner.combinationMode"; final CombinationMode combinationMode; /** * @param p Properties File that specifies <code>loadClassifier</code> * and <code>loadAuxClassifier</code> properties or, alternatively, <code>loadClassifier[1-10]</code> properties. * @throws FileNotFoundException If classifier files not found */ public ClassifierCombiner(Properties p) throws FileNotFoundException { super(p); this.combinationMode = extractCombinationModeSafe(p); String loadPath1, loadPath2; List<String> paths = new ArrayList<String>(); // // preferred configuration: specify up to 10 base classifiers using loadClassifier1 to loadClassifier10 properties // if((loadPath1 = p.getProperty("loadClassifier1")) != null && (loadPath2 = p.getProperty("loadClassifier2")) != null) { paths.add(loadPath1); paths.add(loadPath2); for(int i = 3; i <= 10; i ++){ String path; if ((path = p.getProperty("loadClassifier" + i)) != null) { paths.add(path); } } loadClassifiers(paths); } // // second accepted setup (backward compatible): two classifier given in loadClassifier and loadAuxClassifier // else if((loadPath1 = p.getProperty("loadClassifier")) != null && (loadPath2 = p.getProperty("loadAuxClassifier")) != null){ paths.add(loadPath1); paths.add(loadPath2); loadClassifiers(paths); } // // fall back strategy: use the two default paths on NLP machines // else { paths.add(DEFAULT_CLASSIFIER_PATH); paths.add(DEFAULT_AUX_CLASSIFIER_PATH); loadClassifiers(paths); } } /** Loads a series of base classifiers from the paths specified. * * @param loadPaths Paths to the base classifiers * @throws FileNotFoundException If classifier files not found */ public ClassifierCombiner(CombinationMode combinationMode, String... loadPaths) throws FileNotFoundException { super(new Properties()); this.combinationMode = combinationMode; List<String> paths = new ArrayList<String>(Arrays.asList(loadPaths)); loadClassifiers(paths); } /** Loads a series of base classifiers from the paths specified. * * @param loadPaths Paths to the base classifiers * @throws FileNotFoundException If classifier files not found */ public ClassifierCombiner(String... loadPaths) throws FileNotFoundException { super(new Properties()); this.combinationMode = DEFAULT_COMBINATION_MODE; List<String> paths = new ArrayList<String>(Arrays.asList(loadPaths)); loadClassifiers(paths); } /** Combines a series of base classifiers * * @param classifiers The base classifiers */ public ClassifierCombiner(AbstractSequenceClassifier<IN>... classifiers) { super(new Properties()); this.combinationMode = DEFAULT_COMBINATION_MODE; baseClassifiers = new ArrayList<AbstractSequenceClassifier<IN>>(Arrays.asList(classifiers)); flags.backgroundSymbol = baseClassifiers.get(0).flags.backgroundSymbol; } /** * Either finds COMBINATION_MODE_PROPERTY or returns a default value */ public static CombinationMode extractCombinationMode(Properties p) { String mode = p.getProperty(COMBINATION_MODE_PROPERTY); if (mode == null) { return DEFAULT_COMBINATION_MODE; } else { return CombinationMode.valueOf(mode.toUpperCase()); } } /** * Either finds COMBINATION_MODE_PROPERTY or returns a default * value. If the value is not a legal value, a warning is printed. */ public static CombinationMode extractCombinationModeSafe(Properties p) { try { return extractCombinationMode(p); } catch (IllegalArgumentException e) { System.err.print("Illegal value of " + COMBINATION_MODE_PROPERTY + ": " + p.getProperty(COMBINATION_MODE_PROPERTY)); System.err.print(" Legal values:"); for (CombinationMode mode : CombinationMode.values()) { System.err.print(" " + mode); } System.err.println(); return CombinationMode.NORMAL; } } private void loadClassifiers(List<String> paths) throws FileNotFoundException { baseClassifiers = new ArrayList<AbstractSequenceClassifier<IN>>(); for(String path: paths){ AbstractSequenceClassifier<IN> cls = loadClassifierFromPath(path); baseClassifiers.add(cls); if(DEBUG){ System.err.printf("Successfully loaded classifier #%d from %s.\n", baseClassifiers.size(), path); } } if (baseClassifiers.size() > 0) { flags.backgroundSymbol = baseClassifiers.get(0).flags.backgroundSymbol; } } public static <INN extends CoreMap & HasWord> AbstractSequenceClassifier<INN> loadClassifierFromPath(String path) throws FileNotFoundException { //try loading as a CRFClassifier try { return ErasureUtils.uncheckedCast(CRFClassifier.getClassifier(path)); } catch (Exception e) { e.printStackTrace(); } //try loading as a CMMClassifier try { return ErasureUtils.uncheckedCast(CMMClassifier.getClassifier(path)); } catch (Exception e) { //fail //System.err.println("Couldn't load classifier from path :"+path); FileNotFoundException fnfe = new FileNotFoundException(); fnfe.initCause(e); throw fnfe; } } @Override public Set<String> labels() { Set<String> labs = Generics.newHashSet(); for(AbstractSequenceClassifier<? extends CoreMap> cls: baseClassifiers) labs.addAll(cls.labels()); return labs; } /** * Reads the Answer annotations in the given labellings (produced by the base models) * and combines them using a priority ordering, i.e., for a given baseDocument all * labellings seen before in the baseDocuments list have higher priority. * Writes the answer to AnswerAnnotation in the labeling at position 0 * (considered to be the main document). * * @param baseDocuments Results of all base AbstractSequenceClassifier models * @return A List of IN with the combined annotations. (This is an * updating of baseDocuments.get(0), not a new List.) */ private List<IN> mergeDocuments(List<List<IN>> baseDocuments){ // we should only get here if there is something to merge assert(! baseClassifiers.isEmpty() && ! baseDocuments.isEmpty()); // all base outputs MUST have the same length (we generated them internally!) for(int i = 1; i < baseDocuments.size(); i ++) assert(baseDocuments.get(0).size() == baseDocuments.get(i).size()); String background = baseClassifiers.get(0).flags.backgroundSymbol; // baseLabels.get(i) points to the labels assigned by baseClassifiers.get(i) List<Set<String>> baseLabels = new ArrayList<Set<String>>(); Set<String> seenLabels = Generics.newHashSet(); for (AbstractSequenceClassifier<? extends CoreMap> baseClassifier : baseClassifiers) { Set<String> labs = baseClassifier.labels(); if (combinationMode != CombinationMode.HIGH_RECALL) { labs.removeAll(seenLabels); } else { labs.remove(baseClassifier.flags.backgroundSymbol); labs.remove(background); } seenLabels.addAll(labs); baseLabels.add(labs); } if (DEBUG) { for(int i = 0; i < baseLabels.size(); i ++) System.err.println("mergeDocuments: Using classifier #" + i + " for " + baseLabels.get(i)); System.err.println("mergeDocuments: Background symbol is " + background); System.err.println("Base model outputs:"); for( int i = 0; i < baseDocuments.size(); i ++){ System.err.printf("Output of model #%d:", i); for (IN l : baseDocuments.get(i)) { System.err.print(' '); System.err.print(l.get(CoreAnnotations.AnswerAnnotation.class)); } System.err.println(); } } // incrementally merge each additional model with the main model (i.e., baseDocuments.get(0)) // this keeps adding labels from the additional models to mainDocument // hence, when all is done, mainDocument contains the labels of all base models List<IN> mainDocument = baseDocuments.get(0); for (int i = 1; i < baseDocuments.size(); i ++) { mergeTwoDocuments(mainDocument, baseDocuments.get(i), baseLabels.get(i), background); } if (DEBUG) { System.err.print("Output of combined model:"); for (IN l: mainDocument) { System.err.print(' '); System.err.print(l.get(CoreAnnotations.AnswerAnnotation.class)); } System.err.println(); System.err.println(); } return mainDocument; } /** This merges in labels from the auxDocument into the mainDocument when * tokens have one of the labels in auxLabels, and the subsequence * labeled with this auxLabel does not conflict with any non-background * labelling in the mainDocument. */ static <INN extends CoreMap & HasWord> void mergeTwoDocuments(List<INN> mainDocument, List<INN> auxDocument, Set<String> auxLabels, String background) { boolean insideAuxTag = false; boolean auxTagValid = true; String prevAnswer = background; Collection<INN> constituents = new ArrayList<INN>(); Iterator<INN> auxIterator = auxDocument.listIterator(); for (INN wMain : mainDocument) { String mainAnswer = wMain.get(CoreAnnotations.AnswerAnnotation.class); INN wAux = auxIterator.next(); String auxAnswer = wAux.get(CoreAnnotations.AnswerAnnotation.class); boolean insideMainTag = !mainAnswer.equals(background); /* if the auxiliary classifier gave it one of the labels unique to auxClassifier, we might set the mainLabel to that. */ if (auxLabels.contains(auxAnswer)) { if ( ! prevAnswer.equals(auxAnswer) && ! prevAnswer.equals(background)) { if (auxTagValid){ for (INN wi : constituents) { wi.set(CoreAnnotations.AnswerAnnotation.class, prevAnswer); } } auxTagValid = true; constituents = new ArrayList<INN>(); } insideAuxTag = true; if (insideMainTag) { auxTagValid = false; } prevAnswer = auxAnswer; constituents.add(wMain); } else { if (insideAuxTag) { if (auxTagValid){ for (INN wi : constituents) { wi.set(CoreAnnotations.AnswerAnnotation.class, prevAnswer); } } constituents = new ArrayList<INN>(); } insideAuxTag=false; auxTagValid = true; prevAnswer = background; } } // deal with a sequence final auxLabel if (auxTagValid){ for (INN wi : constituents) { wi.set(CoreAnnotations.AnswerAnnotation.class, prevAnswer); } } } /** * Generates the AnswerAnnotation labels of the combined model for the given * tokens, storing them in place in the tokens. * * @param tokens A List of IN * @return The passed in parameters, which will have the AnswerAnnotation field added/overwritten */ @Override public List<IN> classify(List<IN> tokens) { if (baseClassifiers.isEmpty()) { return tokens; } List<List<IN>> baseOutputs = new ArrayList<List<IN>>(); // the first base model works in place, modifying the original tokens List<IN> output = baseClassifiers.get(0).classifySentence(tokens); // classify(List<IN>) is supposed to work in place, so add AnswerAnnotation to tokens! for (int i = 0, sz = output.size(); i < sz; i++) { tokens.get(i).set(CoreAnnotations.AnswerAnnotation.class, output.get(i).get(CoreAnnotations.AnswerAnnotation.class)); } baseOutputs.add(tokens); for (int i = 1, sz = baseClassifiers.size(); i < sz; i ++) { //List<CoreLabel> copy = deepCopy(tokens); // no need for deep copy: classifySentence creates a copy of the input anyway // List<CoreLabel> copy = tokens; output = baseClassifiers.get(i).classifySentence(tokens); baseOutputs.add(output); } assert(baseOutputs.size() == baseClassifiers.size()); List<IN> finalAnswer = mergeDocuments(baseOutputs); return finalAnswer; } @SuppressWarnings("unchecked") @Override public void train(Collection<List<IN>> docs, DocumentReaderAndWriter<IN> readerAndWriter) { throw new UnsupportedOperationException(); } @Override public void printProbsDocument(List<IN> document) { throw new UnsupportedOperationException(); } @Override public void serializeClassifier(String serializePath) { throw new UnsupportedOperationException(); } @Override public void loadClassifier(ObjectInputStream in, Properties props) throws IOException, ClassCastException, ClassNotFoundException { throw new UnsupportedOperationException(); } @Override public List<IN> classifyWithGlobalInformation(List<IN> tokenSeq, CoreMap doc, CoreMap sent) { return classify(tokenSeq); } /** * Some basic testing of the ClassifierCombiner. * * @param args Command-line arguments as properties: -loadClassifier1 serializedFile -loadClassifier2 serializedFile * @throws Exception If IO or serialization error loading classifiers */ public static void main(String[] args) throws Exception { Properties props = StringUtils.argsToProperties(args); ClassifierCombiner ec = new ClassifierCombiner(props); System.err.println(ec.classifyToString("Marketing : Sony Hopes to Win Much Bigger Market For Wide Range of Small-Video Products --- By Andrew B. Cohen Staff Reporter of The Wall Street Journal")); } }