// AbstractSequenceClassifier -- a framework for probabilistic sequence models. // Copyright (c) 2002-2008 The Board of Trustees of // The Leland Stanford Junior University. All Rights Reserved. // // This program is free software; you can redistribute it and/or // modify it under the terms of the GNU General Public License // as published by the Free Software Foundation; either version 2 // of the License, or (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. // // For more information, bug reports, fixes, contact: // Christopher Manning // Dept of Computer Science, Gates 1A // Stanford CA 94305-9010 // USA // Support/Questions: java-nlp-user@lists.stanford.edu // Licensing: java-nlp-support@lists.stanford.edu // http://nlp.stanford.edu/downloads/crf-classifier.shtml package edu.stanford.nlp.ie; import edu.stanford.nlp.fsm.DFSA; import edu.stanford.nlp.io.RegExFileFilter; import edu.stanford.nlp.ling.CoreAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.*; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.ling.HasWord; import edu.stanford.nlp.ling.Sentence; import edu.stanford.nlp.objectbank.ObjectBank; import edu.stanford.nlp.objectbank.ResettableReaderIteratorFactory; import edu.stanford.nlp.sequences.*; import edu.stanford.nlp.sequences.FeatureFactory; import edu.stanford.nlp.stats.ClassicCounter; import edu.stanford.nlp.stats.Counter; import edu.stanford.nlp.stats.Counters; import edu.stanford.nlp.stats.Sampler; import edu.stanford.nlp.util.*; import java.io.*; import java.text.DecimalFormat; import 
java.text.NumberFormat; import java.util.*; import java.util.regex.Pattern; import java.util.zip.GZIPInputStream; /** This class provides common functionality for (probabilistic) sequence * models. It is a superclass of our CMM and CRF sequence classifiers, * and is even used in the (deterministic) NumberSequenceClassifier. * See implementing classes for more information. * * @author Jenny Finkel * @author Dan Klein * @author Christopher Manning * @author Dan Cer */ public abstract class AbstractSequenceClassifier implements Function<String, String> { public static final String JAR_CLASSIFIER_PATH = "/classifiers/"; public SeqClassifierFlags flags; public Index<String> classIndex; // = null; protected DocumentReaderAndWriter readerAndWriter; // = null; public FeatureFactory featureFactory; protected CoreLabel pad; public int windowSize; protected Set<String> knownLCWords = new HashSet<String>(); /** Construct a SeqClassifierFlags object based on the passed in properties, * and then call the other constructor. * * @param props See SeqClassifierFlags for known properties. */ public AbstractSequenceClassifier(Properties props) { this(new SeqClassifierFlags(props)); } /** Initialize the featureFactor and other variables based on the passed in * flags. * * @param flags A specification of the AbstractSequenceClassifier to construct. */ public AbstractSequenceClassifier(SeqClassifierFlags flags) { this.flags = flags; pad = new CoreLabel(); windowSize = flags.maxLeft + 1; try { featureFactory = (FeatureFactory) Class.forName(flags.featureFactory).newInstance(); } catch (Exception e) { e.printStackTrace(); throw new RuntimeException(e.getMessage()); } reinit(); } /** This method should be called after there have been changes to the * flags (SeqClassifierFlags) variable, such as after deserializing * a classifier. It is called inside the loadClassifier methods. 
*  It assumes that the flags variable and the pad
   *  variable exist, but reinitializes things like the pad variable,
   *  featureFactory and readerAndWriter based on the flags.
   *  <p>
   *  <i>Implementation note:</i> At the moment this method doesn't
   *  set windowSize or featureFactory, since they are being serialized
   *  separately in the
   *  file, but we should probably stop serializing them and just
   *  reinitialize them from the flags?
   */
  protected final void reinit() {
    // Both the guessed and the gold answer of the pad are the background symbol.
    pad.set(AnswerAnnotation.class, flags.backgroundSymbol);
    pad.set(GoldAnswerAnnotation.class, flags.backgroundSymbol);

    try {
      Class<?> readerWriterClass = Class.forName(flags.readerAndWriter);
      readerAndWriter = (DocumentReaderAndWriter) readerWriterClass.newInstance();
    } catch (Exception e) {
      e.printStackTrace();
      throw new RuntimeException(e.getMessage(), e);
    }

    readerAndWriter.init(flags);
    featureFactory.init(flags);
  }


  /** @return The background symbol configured in the flags. */
  public String backgroundSymbol() {
    return flags.backgroundSymbol;
  }


  /** @return A fresh Set holding every label in the class index. */
  public Set<String> labels() {
    Set<String> labelSet = new HashSet<String>();
    labelSet.addAll(classIndex.objectsList());
    return labelSet;
  }


  /**
   * Classify a {@link Sentence}.
   *
   * @param sentence The {@link Sentence} to be classified.
   * @return A newly built list of CoreLabels, one per input word, where the
   *         classifier output for each token is stored in its "answer" field.
   */
  public List<CoreLabel> classifySentence(List<? extends HasWord> sentence) {
    List<CoreLabel> tokens = new ArrayList<CoreLabel>();
    for (HasWord hw : sentence) {
      CoreLabel token = new CoreLabel();
      token.setWord(hw.word());
      // the position of a token is its index, i.e. the number of tokens before it
      token.set(PositionAnnotation.class, Integer.toString(tokens.size()));
      token.set(AnswerAnnotation.class, backgroundSymbol());
      tokens.add(token);
    }

    new ObjectBankWrapper(flags, null, knownLCWords).processDocument(tokens);

    classify(tokens);
    return tokens;
  }


  /** Subclasses that support sequence-model based inference override this;
   *  the default implementation always throws.
   *
   *  @param doc The document to build a model for
   *  @return Never returns normally in this base implementation
   *  @throws UnsupportedOperationException Always
   */
  public SequenceModel getSequenceModel(List<? extends CoreLabel> doc) {
    throw new UnsupportedOperationException();
  }


  public Sampler<List<CoreLabel>> getSampler(final List<?
extends CoreLabel> input) { return new Sampler<List<CoreLabel>>() { SequenceModel model = getSequenceModel(input); SequenceSampler sampler = new SequenceSampler(); public List<CoreLabel> drawSample() { int[] sampleArray = sampler.bestSequence(model); List<CoreLabel> sample = new ArrayList<CoreLabel>(); int i=0; for (CoreLabel word : input) { CoreLabel newWord = new CoreLabel(word); newWord.set(AnswerAnnotation.class, classIndex.get(sampleArray[i++])); sample.add(newWord); } return sample; } }; } public Counter<List<CoreLabel>> classifyKBest(List<CoreLabel> doc, Class<? extends CoreAnnotation<String>> answerField, int k) { if (doc.isEmpty()) { return new ClassicCounter<List<CoreLabel>>(); } // i'm sorry that this is so hideous - JRF ObjectBankWrapper obw = new ObjectBankWrapper(flags, null, knownLCWords); doc = obw.processDocument(doc); SequenceModel model = getSequenceModel(doc); KBestSequenceFinder tagInference = new KBestSequenceFinder(); Counter<int[]> bestSequences = tagInference.kBestSequences(model,k); Counter<List<CoreLabel>> kBest = new ClassicCounter<List<CoreLabel>>(); for (int[] seq : bestSequences.keySet()) { List<CoreLabel> kth = new ArrayList<CoreLabel>(); int pos = model.leftWindow(); for (CoreLabel fi : doc) { CoreLabel newFL = new CoreLabel(fi); String guess = classIndex.get(seq[pos]); fi.remove(AnswerAnnotation.class); // because fake answers will get added during testing newFL.set(answerField, guess); pos++; kth.add(newFL); } kBest.setCount(kth, bestSequences.getCount(seq)); } return kBest; } @SuppressWarnings({"UnusedDeclaration"}) public DFSA<String, Integer> getViterbiSearchGraph(List<CoreLabel> doc, Class<? 
extends CoreAnnotation<String>> answerField) { if (doc.isEmpty()) { return new DFSA<String, Integer>(null); } ObjectBankWrapper obw = new ObjectBankWrapper(flags, null, knownLCWords); doc = obw.processDocument(doc); SequenceModel model = getSequenceModel(doc); return ViterbiSearchGraphBuilder.getGraph(model, classIndex); } /** * Classify a List of CoreLabels using a TrueCasingDocumentReader. * <i>Note:</i> This was fairly quickly added to build a Truecaser. It may * be revised or disappear. * * @param sentence a list of CoreLabels to be classifierd * @return The classified list}. */ public List<CoreLabel> classifyWithCasing(List<CoreLabel> sentence) { List<CoreLabel> document = new ArrayList<CoreLabel>(); int i = 0; for (CoreLabel word : sentence) { CoreLabel wi = new CoreLabel(); if (readerAndWriter instanceof TrueCasingDocumentReaderAndWriter) { wi.setWord(word.word().toLowerCase()); if (flags.useUnknown) { wi.set(UnknownAnnotation.class, (TrueCasingDocumentReaderAndWriter.known(wi.word()) ? "false" : "true")); //System.err.println(wi.word()+" : "+wi.get("unknown")); } } else { wi.setWord(word.word()); } wi.set(PositionAnnotation.class, Integer.toString(i)); wi.set(AnswerAnnotation.class, backgroundSymbol()); document.add(wi); i++; } classify(document); i = 0; for (CoreLabel wi : document) { CoreLabel word = sentence.get(i); if (flags.readerAndWriter.equalsIgnoreCase("edu.stanford.nlp.sequences.TrueCasingDocumentReader")) { String w = word.word(); if (wi.get(AnswerAnnotation.class).equals("INIT_UPPER") || wi.get(PositionAnnotation.class).equals(flags.backgroundSymbol)) { w = w.substring(0,1).toUpperCase()+w.substring(1).toLowerCase(); } else if (wi.get(AnswerAnnotation.class).equals("LOWER")) { w = w.toLowerCase(); } else if (wi.get(AnswerAnnotation.class).equals("UPPER")) { w = w.toUpperCase(); } word.setWord(w); } else { word.setNER(wi.get(AnswerAnnotation.class)); } i++; } return sentence; } /** * Classify the tokens in a String. 
Each sentence becomes a separate * document. * * @param str A String with tokens in one or more sentences of text * to be classified. * @return {@link List} of classified sentences (each a List of * {@link CoreLabel}s). */ public List<List<CoreLabel>> classify(String str) { DocumentReaderAndWriter oldRW = readerAndWriter; readerAndWriter = new PlainTextDocumentReaderAndWriter(); readerAndWriter.init(flags); ObjectBank<List<CoreLabel>> documents = makeObjectBankFromString(str); List<List<CoreLabel>> result = new ArrayList<List<CoreLabel>>(); for (List<CoreLabel> document : documents) { classify(document); List<CoreLabel> sentence = new ArrayList<CoreLabel>(); for (CoreLabel wi : document) { // TaggedWord word = new TaggedWord(wi.word(), wi.answer()); // sentence.add(word); sentence.add(wi); } result.add(sentence); } readerAndWriter = oldRW; return result; } /** * Classify the contents of a file. * * @param filename Contains the sentence(s) to be classified. * @return {@link List} of classified {@link Sentence}s. */ public List<List<CoreLabel>> classifyFile(String filename) { DocumentReaderAndWriter oldRW = readerAndWriter; readerAndWriter = new PlainTextDocumentReaderAndWriter(); readerAndWriter.init(flags); ObjectBank<List<CoreLabel>> documents = makeObjectBankFromFile(filename); List<List<CoreLabel>> result = new ArrayList<List<CoreLabel>>(); for (List<CoreLabel> document : documents) { // System.err.println(document); classify(document); List<CoreLabel> sentence = new ArrayList<CoreLabel>(); for (CoreLabel wi : document) { sentence.add(wi); // System.err.println(wi); } result.add(sentence); } readerAndWriter = oldRW; return result; } /** * Maps a String input to an XML-formatted rendition of applying NER to * the String. Implements the Function interface. Calls * classifyWithInlineXML(String) [q.v.]. */ public String apply(String in) { return classifyWithInlineXML(in); } /** * Classify the contents of a {@link String}. 
Plain text or XML input is * expected and the {@link PlainTextDocumentReaderAndWriter} is used. * The classifier will tokenize the text and treat each sentence as a * separate document. * The output can be specified to be in a choice of three formats: slashTags * (e.g., Bill/PERSON Smith/PERSON died/O ./O), inlineXML * (e.g., <PERSON>Bill Smith</PERSON> * went to <LOCATION>Paris</LOCATION> .), or xml, for stand-off * XML (e.g., <wi num="0" entity="PERSON">Sue</wi> * <wi num="1" entity="O">shouted</wi> ). * There is also a binary choice as to whether the spacing between tokens * of the original is preserved or whether the (tagged) tokens are printed * with a single space (for inlineXML or slashTags) or a single newline * (for xml) between each one. * <p> * <i>Fine points:</i> * The slashTags and xml formats show tokens as transformed * by any normalization processes inside the tokenizer, while inlineXML * shows the tokens exactly as they appeared in the source text. * When a period counts as both part of an abbreviation and as an end of * sentence marker, it is included twice in the output String for slashTags * or xml, but only once for inlineXML, where it is not counted as part of * the abbreviation (or any named entity it is part of). For slashTags with * preserveSpacing=true, there will be two successive periods such as "Jr.." * The tokenized (preserveSpacing=false) output will have a space or a * newline after the last token. * * @param sentences The String to be classified. It will be tokenized and * divided into documents according to (heuristically determined) * sentence boundaries. * @param outputFormat The format to put the output in: one of "slashTags", * "xml", or "inlineXML" * @param preserveSpacing Whether to preserve the input spacing between * tokens, which may sometimes be none (true) or whether to tokenize * the text and print it with one space between each token (false) * @return A {@link String} with annotated with classification * information. 
*/ public String classifyToString(String sentences, String outputFormat, boolean preserveSpacing) { int outFormat = PlainTextDocumentReaderAndWriter.asIntOutputFormat(outputFormat); DocumentReaderAndWriter tmp = readerAndWriter; readerAndWriter = new PlainTextDocumentReaderAndWriter(); readerAndWriter.init(flags); ObjectBank<List<CoreLabel>> documents = makeObjectBankFromString(sentences); StringBuilder sb = new StringBuilder(); for (List<CoreLabel> doc : documents) { classify(doc); sb.append(((PlainTextDocumentReaderAndWriter) readerAndWriter).getAnswers(doc, outFormat, preserveSpacing)); } readerAndWriter = tmp; return sb.toString(); } /** * Classify the contents of a {@link String}. Plain text or XML is * expected and the {@link PlainTextDocumentReaderAndWriter} is used. * The classifier will treat each sentence as a separate document. * The output can be specified to be in a choice of formats: * Output * is in inline XML format (e.g. <PERSON>Bill Smith</PERSON> * went to <LOCATION>Paris</LOCATION> .) * * @param sentences The string to be classified * @return A {@link String} with annotated with classification * information. */ public String classifyWithInlineXML(String sentences) { return classifyToString(sentences, "inlineXML", true); } /** * Classify the contents of a String to a tagged word/class String. * Plain text or XML input is * expected and the {@link PlainTextDocumentReaderAndWriter} is used. Output * looks like: My/O name/O is/O Bill/PERSON Smith/PERSON ./O * * @param sentences The String to be classified * @return A String annotated with classification * information. */ public String classifyToString(String sentences) { return classifyToString(sentences, "slashTags", true); } /** * Classify the contents of a {@link String}. Plain text or XML input text * is expected and the {@link PlainTextDocumentReaderAndWriter} is used. * Output is a (possibly empty, but not <code>null</code> List of Triples. 
* Each Triple is an entity name, followed by beginning and ending * character offsets in the original String. * Character offsets can be thought of as fenceposts between the characters, * or, like certain methods in the Java String class, as character positions, * numbered starting from 0, with the end index pointing to the position * AFTER the entity ends. That is, end - start is the length of the entity * in characters. * <p> * <i>Fine points:</i> Token offsets are true wrt the source text, even though * the tokenizer may internally normalize certain tokens to String * representations of different lengths (e.g., " becoming `` or ''). * When a period counts as both part of an abbreviation and as an end of * sentence marker, and that abbreviation is part of a named entity, * the reported entity string excludes the period. * * @param sentences The string to be classified * @return A {@link List} of {@link Triple}s, each of which gives an entity * type and the beginning and ending character offsets. */ public List<Triple<String,Integer,Integer>> classifyToCharacterOffsets(String sentences) { DocumentReaderAndWriter tmp = readerAndWriter; readerAndWriter = new PlainTextDocumentReaderAndWriter(); readerAndWriter.init(flags); ObjectBank<List<CoreLabel>> documents = makeObjectBankFromString(sentences); readerAndWriter = tmp; List<Triple<String,Integer,Integer>> entities = new ArrayList<Triple<String,Integer,Integer>>(); for (List<CoreLabel> doc : documents) { String prevEntityType = flags.backgroundSymbol; Triple<String,Integer,Integer> prevEntity = null; classify(doc); for (CoreLabel fl : doc) { String guessedAnswer = fl.get(AnswerAnnotation.class); if (guessedAnswer.equals(flags.backgroundSymbol)) { if (prevEntity != null) { entities.add(prevEntity); prevEntity = null; } } else { if ( ! 
guessedAnswer.equals(prevEntityType)) { if (prevEntity != null) { entities.add(prevEntity); } prevEntity = new Triple<String,Integer,Integer>(guessedAnswer, fl.get(BeginPositionAnnotation.class), fl.get(EndPositionAnnotation.class)); } else { assert prevEntity != null; // if you read the code carefully, this should always be true! prevEntity.setThird(fl.get(EndPositionAnnotation.class)); } } prevEntityType = guessedAnswer; } // include any entity at end of doc if (prevEntity != null) { entities.add(prevEntity); } } return entities; } /** * ONLY USE IF LOADED A CHINESE WORD SEGMENTER!!!!! * * @param sentence The string to be classified * @return List of words */ public List<String> segmentString(String sentence) { ObjectBank<List<CoreLabel>> docs = makeObjectBankFromString(sentence); // @ cer - previously, there was the following todo here: // // TODO: use printAnswers(List<CoreLabel> doc, PrintWriter pw) // instead // // I went ahead and did the TODO. However, given that the TODO // was incredibly easy to do, I'm wondering if it was left // as a todo for a reason. For example, I'm concerned that something // else bizarrely breaks if this method calls printAnswers, as the method // arguably should, instead of manually building up the output string, // as was being done before. // // In any case, by doing the TODO, I was able to improve the online // parser/segmenter since all of the wonderful post processing // stuff is now being done to the segmented strings. // // However, if anything I'm not aware of broke, please just shot me // an e-mail (cerd@cs.colorado.edu) and I will look into and fix // the problem asap. // Also... // // Using a temporary file for flags.testFile is not elegant // However, I think all more elegant solutions would require // touching more source files. Touching more source files // risks incurring the wrath of whoever regularly works-with // and/or 'owns' this part of the codebase. 
// // (...the testFile stuff is necessary for segmentation whitespace // normalization) String oldTestFile = flags.testFile; try { File tempFile = File.createTempFile("segmentString", ".txt"); tempFile.deleteOnExit(); flags.testFile = tempFile.getPath(); FileWriter tempWriter = new FileWriter(tempFile); tempWriter.write(sentence); tempWriter.close(); } catch (IOException e) { System.err.println("Warning(segmentString): " + "couldn't create temporary file for flags.testFile"); flags.testFile = ""; } StringWriter stringWriter = new StringWriter(); PrintWriter stringPrintWriter = new PrintWriter(stringWriter); for (List<CoreLabel> doc : docs) { classify(doc); readerAndWriter.printAnswers(doc, stringPrintWriter); stringPrintWriter.println(); } stringPrintWriter.close(); String segmented = stringWriter.toString(); flags.testFile = oldTestFile; return Arrays.asList(segmented.split("\\s")); } /** * Classify the contents of {@link SeqClassifierFlags scf.testFile}. * The file should be in the format * expected based on {@link SeqClassifierFlags scf.documentReader}. * * @return A {@link List} of {@link List}s of classified * {@link CoreLabel}s where each * {@link List} refers to a document/sentence. */ // public ObjectBank<List<CoreLabel>> test() { // return test(flags.testFile); // } /** * Classify the contents of a file. The file should be in the format * expected based on {@link SeqClassifierFlags scf.documentReader} if the * file is specified in {@link SeqClassifierFlags scf.testFile}. If the * file being read is from {@link SeqClassifierFlags scf.textFile} then * the {@link PlainTextDocumentReaderAndWriter} is used. * * @param filename The path to the specified file * @return A {@link List} of {@link List}s of classified {@link CoreLabel}s where each * {@link List} refers to a document/sentence. 
*/ // public ObjectBank<List<CoreLabel>> test(String filename) { // // only for the OCR data does this matter // flags.ocrTrain = false; // ObjectBank<List<CoreLabel>> docs = makeObjectBank(filename); // return testDocuments(docs); // } /** * Classify a {@link List} of {@link CoreLabel}s. * * @param document A {@link List} of {@link CoreLabel}s. * @return the same {@link List}, but with the elements annotated * with their answers (with <code>setAnswer()</code>). */ public abstract List<CoreLabel> classify(List<CoreLabel> document); /** Train the classifier based on values in flags. It will use the first * of these variables that is defined: trainFiles (and baseTrainDir), * trainFileList, trainFile. */ public void train() { if (flags.trainFiles != null) { train(flags.baseTrainDir, flags.trainFiles); } else if (flags.trainFileList != null) { String[] files = flags.trainFileList.split(","); train(files); } else { train(flags.trainFile); } } public void train(String filename) { // only for the OCR data does this matter flags.ocrTrain = true; train(makeObjectBankFromFile(filename)); } public void train(String baseTrainDir, String trainFiles) { // only for the OCR data does this matter flags.ocrTrain = true; train(makeObjectBankFromFiles(baseTrainDir, trainFiles)); } public void train(String[] trainFileList) { // only for the OCR data does this matter flags.ocrTrain = true; train(makeObjectBankFromFiles(trainFileList)); } public abstract void train(ObjectBank<List<CoreLabel>> docs); /** * Reads a String into an ObjectBank object. * NOTE: that the current implementation of ReaderIteratorFactory will first * try to interpret each string as a filename, so this method * will yield unwanted results if it applies to a string that is * at the same time a filename. It prints out a warning, at least. * * @param string The String which will be the content of the ObjectBank * (ASSUMING THAT NO FILE OF THIS NAME EXISTS!) 
* @return The ObjectBank */ public ObjectBank<List<CoreLabel>> makeObjectBankFromString(String string) { // try to interpret as a file to throw warning. File file = new File(string); if (file.exists()) { System.err.println("Warning: calling makeObjectBankFromString with an existing file name! This will open the file instead."); } if (flags.announceObjectBankEntries) { System.err.print("Reading data using "); System.err.println(flags.readerAndWriter); if (flags.inputEncoding == null) { System.err.println("Getting data from " + string + " (default encoding)"); } else { System.err.println("Getting data from " + string + " (" + flags.inputEncoding + " encoding)"); } } return new ObjectBankWrapper(flags, new ObjectBank<List<CoreLabel>>(new ResettableReaderIteratorFactory(string), readerAndWriter), knownLCWords); } public ObjectBank<List<CoreLabel>> makeObjectBankFromFile(String filename) { String[] fileAsArray = {filename}; return makeObjectBankFromFiles(fileAsArray); } public ObjectBank<List<CoreLabel>> makeObjectBankFromFiles(String[] trainFileList) { //try{ Collection<File> files = new ArrayList<File>(); for (String trainFile : trainFileList) { File f = new File(trainFile); files.add(f); } // System.err.printf("trainFileList contains %d file%s.\n", files.size(), files.size() == 1 ? 
"": "s"); return new ObjectBankWrapper(flags, new ObjectBank<List<CoreLabel>>(new ResettableReaderIteratorFactory(files), readerAndWriter), knownLCWords); //} catch (IOException e) { //throw new RuntimeException(e); //} } public ObjectBank<List<CoreLabel>> makeObjectBankFromFiles(String baseDir, String filePattern) { try { File path = new File(baseDir); FileFilter filter = new RegExFileFilter(Pattern.compile(filePattern)); File[] origFiles = path.listFiles(filter); Collection<BufferedReader> files = new ArrayList<BufferedReader>(); for (File file : origFiles) { if (file.isFile()) { if (flags.inputEncoding == null) { if (flags.announceObjectBankEntries) { System.err.println("Getting data from " + file + " (default encoding)"); } files.add(new BufferedReader(new InputStreamReader(new FileInputStream(file)))); } else { if (flags.announceObjectBankEntries) { System.err.println("Getting data from " + file + " (" + flags.inputEncoding + " encoding)"); } files.add(new BufferedReader(new InputStreamReader(new FileInputStream(file), flags.inputEncoding))); } } } if (files.isEmpty()) { throw new RuntimeException("No matching files: " + baseDir + '\t' + filePattern); } return new ObjectBankWrapper(flags, new ObjectBank<List<CoreLabel>>(new ResettableReaderIteratorFactory(files), readerAndWriter), knownLCWords); } catch (IOException e) { throw new RuntimeException(e); } } public ObjectBank<List<CoreLabel>> makeObjectBankFromFiles(Collection<File> files) { if (files.isEmpty()) { throw new RuntimeException("Attempt to make ObjectBank with empty file list"); } return new ObjectBankWrapper(flags, new ObjectBank<List<CoreLabel>>(new ResettableReaderIteratorFactory(files), readerAndWriter), knownLCWords); } /** Set up an ObjectBank that will allow one to iterate over a * collection of documents obtained from the passed in Reader. * Each document will be represented as a list of CoreLabel. 
* If the ObjectBank iterator() is called until hasNext() returns false, * then the Reader will be read till end of file, but no * reading is done at the time of this call. Reading is done using the * reading method specified in <code>flags.documentReader</code>, * and for some reader choices, the column mapping given in * <code>flags.map</code>. * * @param in Input data * addNEWLCWords do we add new lowercase words from this data to the word shape classifier * @return The list of documents */ protected ObjectBank<List<CoreLabel>> makeObjectBankFromReader(BufferedReader in) { if (flags.announceObjectBankEntries) { System.err.print("Reading data using "); System.err.println(flags.readerAndWriter); } return new ObjectBankWrapper(flags, new ObjectBank<List<CoreLabel>>(new ResettableReaderIteratorFactory(in), readerAndWriter), knownLCWords); } /** * Takes the file, reads it in, and prints out the likelihood of * each possible label at each point. * * @param filename The path to the specified file */ public void printProbs(String filename) { // only for the OCR data does this matter flags.ocrTrain = false; ObjectBank<List<CoreLabel>> docs = makeObjectBankFromFile(filename); printProbsDocuments(docs); } /** * Takes a {@link List} of documents and prints the likelihood * of each possible label at each point. * * @param documents A {@link List} of {@link List} of {@link CoreLabel}s. */ public void printProbsDocuments(ObjectBank<List<CoreLabel>> documents) { for (List<CoreLabel> doc : documents) { printProbsDocument(doc); System.out.println(); } } public abstract void printProbsDocument(List<CoreLabel> document); /** Load a test file, run the classifier on it, and then print the answers * to stdout (with timing to stderr). This uses the value of * flags.documentReader to determine testFile format. * * @param testFile The file to test on. 
*/ public void classifyAndWriteAnswers(String testFile) throws Exception { ObjectBank<List<CoreLabel>> documents = makeObjectBankFromFile(testFile); classifyAndWriteAnswers(documents); } public void classifyAndWriteAnswers(String baseDir, String filePattern) throws Exception { ObjectBank<List<CoreLabel>> documents = makeObjectBankFromFiles(baseDir, filePattern); classifyAndWriteAnswers(documents); } public void classifyAndWriteAnswers(Collection<File> testFiles) throws Exception{ ObjectBank<List<CoreLabel>> documents = makeObjectBankFromFiles(testFiles); classifyAndWriteAnswers(documents); } private void classifyAndWriteAnswers(ObjectBank<List<CoreLabel>> documents) throws Exception { Timing timer = new Timing(); int numWords = 0; int numDocs = 0; for (List<CoreLabel> doc : documents) { classify(doc); numWords += doc.size(); writeAnswers(doc); numDocs++; } long millis = timer.stop(); double wordspersec = numWords / (((double) millis) / 1000); NumberFormat nf = new DecimalFormat("0.00"); // easier way! System.err.println(StringUtils.getShortClassName(this) + " tagged " + numWords + " words in " + numDocs + " documents at " + nf.format(wordspersec) + " words per second."); } /** Load a test file, run the classifier on it, and then print the answers * to stdout (with timing to stderr). This uses the value of * flags.documentReader to determine testFile format. * * @param testFile The file to test on. 
*/ public void classifyAndWriteAnswersKBest(String testFile, int k) throws Exception { Timing timer = new Timing(); ObjectBank<List<CoreLabel>> documents = makeObjectBankFromFile(testFile); int numWords = 0; int numSentences = 0; for (List<CoreLabel> doc : documents) { Counter<List<CoreLabel>> kBest = classifyKBest(doc, AnswerAnnotation.class, k); numWords += doc.size(); List<List<CoreLabel>> sorted = Counters.toSortedList(kBest); int n = 1; for (List<CoreLabel> l : sorted) { System.out.println("<sentence id="+numSentences+" k="+n+" logProb="+kBest.getCount(l)+" prob="+Math.exp(kBest.getCount(l))+ '>'); writeAnswers(l); System.out.println("</sentence>"); n++; } numSentences++; } long millis = timer.stop(); double wordspersec = numWords / (((double) millis) / 1000); NumberFormat nf = new DecimalFormat("0.00"); // easier way! System.err.println(this.getClass().getName()+" tagged " + numWords + " words in " + numSentences + " documents at " + nf.format(wordspersec) + " words per second."); } /** Load a test file, run the classifier on it, and then write a Viterbi search graph for * each sequence. * * @param testFile The file to test on. */ public void classifyAndWriteViterbiSearchGraph(String testFile, String searchGraphPrefix) throws Exception { Timing timer = new Timing(); ObjectBank<List<CoreLabel>> documents = makeObjectBankFromFile(testFile); int numWords = 0; int numSentences = 0; for (List<CoreLabel> doc : documents) { DFSA<String, Integer> tagLattice = getViterbiSearchGraph(doc, AnswerAnnotation.class); numWords += doc.size(); PrintWriter latticeWriter = new PrintWriter(new FileOutputStream(searchGraphPrefix+ '.' +numSentences+".wlattice")); PrintWriter vsgWriter = new PrintWriter(new FileOutputStream(searchGraphPrefix+ '.' 
+numSentences+".lattice")); if(readerAndWriter instanceof LatticeWriter) ((LatticeWriter)readerAndWriter).printLattice(tagLattice, doc, latticeWriter); tagLattice.printAttFsmFormat(vsgWriter); latticeWriter.close(); vsgWriter.close(); numSentences++; } long millis = timer.stop(); double wordspersec = numWords / (((double) millis) / 1000); NumberFormat nf = new DecimalFormat("0.00"); // easier way! System.err.println(this.getClass().getName()+" tagged " + numWords + " words in " + numSentences + " documents at " + nf.format(wordspersec) + " words per second."); } /** Write the classifications of the Sequence classifier out * to stdout in a format * determined by the DocumentReaderAndWriter used. * If the flag <code>outputEncoding</code> is defined, the output * is written in that character encoding, otherwise in the system default * character encoding. * * @param doc Documents to write out * @throws Exception If an IO problem */ public void writeAnswers(List<CoreLabel> doc) throws Exception { if (flags.lowerNewgeneThreshold) { return; } if (flags.numRuns <= 1) { PrintWriter out; if (flags.outputEncoding == null) { out = new PrintWriter(System.out, true); } else { out = new PrintWriter(new OutputStreamWriter(System.out, flags.outputEncoding), true); } readerAndWriter.printAnswers(doc, out); // out.println(); out.flush(); } } /** Serialize a sequence classifier to a file on the given path. * * @param serializePath The path/filename to write the classifier to. */ public abstract void serializeClassifier(String serializePath); /** * Loads a classifier from the given input stream. * The JVM shuts down (System.exit(1)) if there is an exception. * This does not close the InputStream. * * @param in The InputStream to read from */ public void loadClassifierNoExceptions(InputStream in) { // load the classifier try { loadClassifier(in); } catch (Exception e) { e.printStackTrace(); System.exit(1); } } /** Load a classsifier from the specified InputStream. 
* No extra properties are supplied. * This does not close the InputStream. * * @param in The InputStream to load the serialized classifier from * * @throws IOException If there are problems accessing the input stream * @throws ClassCastException If there are problems interpreting the serialized data * @throws ClassNotFoundException If there are problems interpreting the serialized data */ public void loadClassifier(InputStream in) throws IOException, ClassCastException, ClassNotFoundException { loadClassifier(in, null); } /** Load a classsifier from the specified InputStream. * The classifier is reinitialized from the flags serialized in the * classifier. * This does not close the InputStream. * * @param in The InputStream to load the serialized classifier from * @param props This Properties object will be used to update the SeqClassifierFlags which * are read from the serialized classifier * * @throws IOException If there are problems accessing the input stream * @throws ClassCastException If there are problems interpreting the serialized data * @throws ClassNotFoundException If there are problems interpreting the serialized data */ public void loadClassifier(InputStream in, Properties props) throws IOException, ClassCastException, ClassNotFoundException { loadClassifier(new ObjectInputStream(in), props); } /** Load a classsifier from the specified input stream. * The classifier is reinitialized from the flags serialized in the * classifier. 
* * @param in The InputStream to load the serialized classifier from * @param props This Properties object will be used to update the SeqClassifierFlags which * are read from the serialized classifier * * @throws IOException If there are problems accessing the input stream * @throws ClassCastException If there are problems interpreting the serialized data * @throws ClassNotFoundException If there are problems interpreting the serialized data */ public abstract void loadClassifier(ObjectInputStream in, Properties props) throws IOException, ClassCastException, ClassNotFoundException; /** * Loads a classifier from the file specified by loadPath. If loadPath * ends in .gz, uses a GZIPInputStream, else uses a regular FileInputStream. */ public void loadClassifier(String loadPath) throws ClassCastException, IOException, ClassNotFoundException { loadClassifier(new File(loadPath)); } public void loadClassifierNoExceptions(String loadPath) { loadClassifierNoExceptions(new File(loadPath)); } public void loadClassifierNoExceptions(String loadPath, Properties props) { loadClassifierNoExceptions(new File(loadPath), props); } public void loadClassifier(File file) throws ClassCastException, IOException, ClassNotFoundException { loadClassifier(file, null); } /** * Loads a classifier from the file specified. If the file's name * ends in .gz, uses a GZIPInputStream, else uses a regular FileInputStream. * This method closes the File when done. * * @param file Loads a classifier from this file. 
* @param props Properties in this object will be used to overwrite those * specified in the serialized classifier * * @throws IOException If there are problems accessing the input stream * @throws ClassCastException If there are problems interpreting the serialized data * @throws ClassNotFoundException If there are problems interpreting the serialized data */ public void loadClassifier(File file, Properties props) throws ClassCastException, IOException, ClassNotFoundException { Timing.startDoing("Loading classifier from " + file.getAbsolutePath()); BufferedInputStream bis; if (file.getName().endsWith(".gz")) { bis = new BufferedInputStream(new GZIPInputStream(new FileInputStream(file))); } else { bis = new BufferedInputStream(new FileInputStream(file)); } loadClassifier(bis, props); bis.close(); Timing.endDoing(); } public void loadClassifierNoExceptions(File file) { loadClassifierNoExceptions(file, null); } public void loadClassifierNoExceptions(File file, Properties props) { try { loadClassifier(file, props); } catch (Exception e) { System.err.println("Error deserializing " + file.getAbsolutePath()); e.printStackTrace(); System.exit(1); } } /** * This function will load a classifier that is stored inside a jar file * (if it is so stored). The classifier should be specified as its full * filename, but the path in the jar file (<code>/classifiers/</code>) is * coded in this class. If the classifier is not stored in the jar file * or this is not run from inside a jar file, then this function will * throw a RuntimeException. * * @param modelName The name of the model file. Iff it ends in .gz, then * it is assumed to be gzip compressed. * @param props A Properties object which can override certain properties * in the serialized file, such as the DocumentReaderAndWriter. * You can pass in <code>null</code> to override nothing. 
*/
  public void loadJarClassifier(String modelName, Properties props) {
    Timing.startDoing("Loading JAR-internal classifier " + modelName);
    try {
      InputStream is = getClass().getResourceAsStream(JAR_CLASSIFIER_PATH + modelName);
      if (is == null) {
        // getResourceAsStream signals a missing resource with null; fail here with
        // a precise cause instead of a NullPointerException further down.
        throw new FileNotFoundException("No classpath resource named " + JAR_CLASSIFIER_PATH + modelName);
      }
      boolean loaded = false;
      try {
        if (modelName.endsWith(".gz")) {
          is = new GZIPInputStream(is);
        }
        is = new BufferedInputStream(is);
        loadClassifier(is, props);
        loaded = true;
      } finally {
        try {
          is.close();
        } catch (IOException closeFailure) {
          // Report a close failure only after a successful load, so that it
          // never replaces the exception that made loading fail.
          if (loaded) {
            throw closeFailure;
          }
        }
      }
      Timing.endDoing();
    } catch (Exception e) {
      String msg = "Error loading classifier from jar file (most likely you are not running this code from a jar file or the named classifier is not stored in the jar file)";
      throw new RuntimeException(msg, e);
    }
  }

}