package edu.stanford.nlp.sequences;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.objectbank.ObjectBank;
import edu.stanford.nlp.process.Americanize;
import edu.stanford.nlp.process.WordShapeClassifier;
import edu.stanford.nlp.process.WordToSentenceProcessor;
import edu.stanford.nlp.util.AbstractIterator;
import edu.stanford.nlp.util.CoreMap;

import java.util.*;
import java.util.regex.Pattern;

/**
 * This class is used to wrap the ObjectBank used by the sequence
 * models and is where any sort of general processing, like the IOB mapping
 * stuff and wordshape stuff, should go.
 * It checks the SeqClassifierFlags to decide what to do.
 * <p>
 * TODO: We should rearchitect this so that the FeatureFactory-specific
 * stuff is done by a callback to the relevant FeatureFactory.
 *
 * @author Jenny Finkel
 */
public class ObjectBankWrapper<IN extends CoreMap> extends ObjectBank<List<IN>> {

  private static final long serialVersionUID = -3838331732026362075L;

  private final SeqClassifierFlags flags;
  private final ObjectBank<List<IN>> wrapped;
  private final Set<String> knownLCWords;


  public ObjectBankWrapper(SeqClassifierFlags flags, ObjectBank<List<IN>> wrapped, Set<String> knownLCWords) {
    super(null, null);
    this.flags = flags;
    this.wrapped = wrapped;
    this.knownLCWords = knownLCWords;
  }


  @Override
  public Iterator<List<IN>> iterator() {
    return new WrappedIterator(wrapped.iterator());
  }

  private class WrappedIterator extends AbstractIterator<List<IN>> {

    Iterator<List<IN>> wrappedIter;
    Iterator<List<IN>> spilloverIter;

    public WrappedIterator(Iterator<List<IN>> wrappedIter) {
      this.wrappedIter = wrappedIter;
    }

    @Override
    public boolean hasNext() {
      while ((spilloverIter == null || !spilloverIter.hasNext()) &&
             wrappedIter.hasNext()) {
        List<IN> doc = wrappedIter.next();
        List<List<IN>> docs = new ArrayList<>();
        docs.add(doc);
        fixDocLengths(docs);
        spilloverIter = docs.iterator();
      }
      return wrappedIter.hasNext() || (spilloverIter != null && spilloverIter.hasNext());
    }

    @Override
    public List<IN> next() {
      // this while loop now is redundant because it should
      // have already been done in "hasNext".
      // I'm keeping it so that the diff is minimal.
      // -pichuan
      while (spilloverIter == null || !spilloverIter.hasNext()) {
        List<IN> doc = wrappedIter.next();
        List<List<IN>> docs = new ArrayList<>();
        docs.add(doc);
        fixDocLengths(docs);
        spilloverIter = docs.iterator();
      }

      return processDocument(spilloverIter.next());
    }

  } // end class WrappedIterator


  public List<IN> processDocument(List<IN> doc) {
    if (flags.mergeTags) { mergeTags(doc); }
    if (flags.iobTags) { iobTags(doc); }
    doBasicStuff(doc);

    return doc;
  }

  private String intern(String s) {
    if (flags.intern) {
      return s.intern();
    } else {
      return s;
    }
  }

  private final Pattern monthDayPattern = Pattern.compile("Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday|January|February|March|April|May|June|July|August|September|October|November|December", Pattern.CASE_INSENSITIVE);

  private String fix(String word) {
    if (flags.normalizeTerms || flags.normalizeTimex) {
      // Same case for days/months: map to lowercase
      if (monthDayPattern.matcher(word).matches()) {
        return word.toLowerCase();
      }
    }
    if (flags.normalizeTerms) {
      return Americanize.americanize(word, false);
    }
    return word;
  }


  private void doBasicStuff(List<IN> doc) {
    int position = 0;
    for (IN fl : doc) {
      // position in document
      fl.set(CoreAnnotations.PositionAnnotation.class, Integer.toString(position++));

      // word shape
      if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) && (!flags.useShapeStrings)) {
        // TODO: if we pass in a FeatureFactory, as suggested by an earlier comment,
        // we should use that FeatureFactory's getWord function
        String word = fl.get(CoreAnnotations.TextAnnotation.class);
        if (flags.wordFunction != null) {
          word = flags.wordFunction.apply(word);
        }
        if ( ! word.isEmpty() && Character.isLowerCase(word.codePointAt(0))) {
          knownLCWords.add(word);
        }
        String s = intern(WordShapeClassifier.wordShape(word, flags.wordShape, knownLCWords));
        fl.set(CoreAnnotations.ShapeAnnotation.class, s);
      }

      // normalizing and interning
      // was the following; should presumably now be
      // if ("CTBSegDocumentReader".equalsIgnoreCase(flags.documentReader)) {
      if ("edu.stanford.nlp.wordseg.Sighan2005DocumentReaderAndWriter".equalsIgnoreCase(flags.readerAndWriter)) {
        // for Chinese segmentation, the "word" annotation is not used, and goldAnswer is ignored for memory efficiency.
        fl.set(CoreAnnotations.CharAnnotation.class, intern(fix(fl.get(CoreAnnotations.CharAnnotation.class))));
      } else {
        fl.set(CoreAnnotations.TextAnnotation.class, intern(fix(fl.get(CoreAnnotations.TextAnnotation.class))));
        // only override GoldAnswer if not set - so that a DocumentReaderAndWriter can set it right in the first place.
        if (fl.get(CoreAnnotations.GoldAnswerAnnotation.class) == null) {
          fl.set(CoreAnnotations.GoldAnswerAnnotation.class, fl.get(CoreAnnotations.AnswerAnnotation.class));
        }
      }
    }
  }

  /**
   * Take a {@link List} of documents (which are themselves {@link List}s
   * of something that extends {@link CoreMap}, CoreLabel by default),
   * and if any are longer than the length
   * specified by flags.maxDocSize split them up. If maxDocSize is negative,
   * nothing is changed. In practice, documents need to be not too long or
   * else the CRF inference will fail due to numerical problems.
   * This method tries to be smart
   * and split on sentence boundaries, but this is hard-coded to English.
   *
   * @param docs The list of documents whose length might be adjusted.
   */
  private void fixDocLengths(List<List<IN>> docs) {
    final int maxDocSize = flags.maxDocSize;

    WordToSentenceProcessor<IN> wts = new WordToSentenceProcessor<>();
    List<List<IN>> newDocuments = new ArrayList<>();
    for (List<IN> document : docs) {
      if (maxDocSize <= 0 || document.size() <= maxDocSize) {
        if (flags.keepEmptySentences || ! document.isEmpty()) {
          newDocuments.add(document);
        }
        continue;
      }
      List<List<IN>> sentences = wts.process(document);
      List<IN> newDocument = new ArrayList<>();
      for (List<IN> sentence : sentences) {
        if (newDocument.size() + sentence.size() > maxDocSize) {
          if ( ! newDocument.isEmpty()) {
            newDocuments.add(newDocument);
          }
          newDocument = new ArrayList<>();
        }
        newDocument.addAll(sentence);
      }
      if (flags.keepEmptySentences || ! newDocument.isEmpty()) {
        newDocuments.add(newDocument);
      }
    }

    docs.clear();
    docs.addAll(newDocuments);
  }

  /**
   * Convert the AnswerAnnotation tags to IOB2 encoding: the first token of each
   * entity gets a "B-" prefix, later tokens of the same entity get an "I-" prefix,
   * and tags that already carry a "B-" prefix are left unchanged.
   *
   * @param doc The document whose AnswerAnnotation is rewritten in place
   */
  private void iobTags(List<IN> doc) {
    String lastTag = "";
    for (IN wi : doc) {
      String answer = wi.get(CoreAnnotations.AnswerAnnotation.class);
      if (answer != null && !flags.backgroundSymbol.equals(answer)) {
        int index = answer.indexOf('-');
        String prefix;
        String label;
        if (index < 0) {
          prefix = "";
          label = answer;
        } else {
          prefix = answer.substring(0, index);
          label = answer.substring(index + 1);
        }

        if (!prefix.equals("B")) {
          if (!label.equals(lastTag)) {
            wi.set(CoreAnnotations.AnswerAnnotation.class, "B-" + label);
          } else {
            wi.set(CoreAnnotations.AnswerAnnotation.class, "I-" + label);
          }
        }
        lastTag = label;
      } else {
        lastTag = answer;
      }
    }
  }


  /** Change some form of IOB/IOE encoding via forms like "I-PERS" to
   *  IO encoding as just "PERS".
   *
   *  @param doc The document for which the AnswerAnnotation will be changed (in place)
   */
  private void mergeTags(List<IN> doc) {
    for (IN wi : doc) {
      String answer = wi.get(CoreAnnotations.AnswerAnnotation.class);
      if (answer == null) {
        continue;
      }
      if ( ! answer.equals(flags.backgroundSymbol)) {
        int index = answer.indexOf('-');
        if (index >= 0) {
          answer = answer.substring(index + 1);
        }
      }
      wi.set(CoreAnnotations.AnswerAnnotation.class, answer);
    }
  }


  // all the other methods from ObjectBank just delegate to the wrapped ObjectBank

  @Override
  public boolean add(List<IN> o) { return wrapped.add(o); }

  @Override
  public boolean addAll(Collection<? extends List<IN>> c) { return wrapped.addAll(c); }

  @Override
  public void clear() { wrapped.clear(); }

  @Override
  public void clearMemory() { wrapped.clearMemory(); }

  public boolean contains(List<IN> o) { return wrapped.contains(o); }

  @Override
  public boolean containsAll(Collection<?> c) { return wrapped.containsAll(c); }

  @Override
  public boolean isEmpty() { return wrapped.isEmpty(); }

  @Override
  public void keepInMemory(boolean keep) { wrapped.keepInMemory(keep); }

  public boolean remove(List<IN> o) { return wrapped.remove(o); }

  @Override
  public boolean removeAll(Collection<?> c) { return wrapped.removeAll(c); }

  @Override
  public boolean retainAll(Collection<?> c) { return wrapped.retainAll(c); }

  @Override
  public int size() { return wrapped.size(); }

  @Override
  public Object[] toArray() { return wrapped.toArray(); }

  public List<IN>[] toArray(List<IN>[] o) { return wrapped.toArray(o); }

} // end class ObjectBankWrapper
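
/*
 * A minimal usage sketch (not part of the original class; the names "flags" and
 * "documents" and the loop body are assumptions for illustration). A sequence model
 * would typically wrap the ObjectBank it reads its documents from, so that every
 * document handed back by iteration has already been length-limited, IOB-normalized,
 * and word-shape annotated:
 *
 *   SeqClassifierFlags flags = ...;               // configured elsewhere
 *   ObjectBank<List<CoreLabel>> documents = ...;  // e.g. produced by a DocumentReaderAndWriter
 *   ObjectBankWrapper<CoreLabel> wrapper =
 *       new ObjectBankWrapper<>(flags, documents, new HashSet<>());
 *   for (List<CoreLabel> doc : wrapper) {
 *     // train or evaluate on the processed document
 *   }
 */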