package edu.stanford.nlp.sequences;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations.AnswerAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.CharAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.GoldAnswerAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.PositionAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.ShapeAnnotation;
import edu.stanford.nlp.objectbank.ObjectBank;
import edu.stanford.nlp.process.Americanize;
import edu.stanford.nlp.process.WordShapeClassifier;
import edu.stanford.nlp.util.AbstractIterator;
import java.util.*;
import java.util.regex.Pattern;
/**
* This class is used to wrap the ObjectBank used by the sequence
* models and is where any sort of general processing, like the IOB mapping
* stuff and wordshape stuff, should go.
* It checks the SeqClassifierFlags to decide what to do.
* <p>
* TODO: We should rearchitect this so that the FeatureFactory-specific
* stuff is done by a callback to the relevant FeatureFactory.
*
* @author Jenny Finkel
*/
public class ObjectBankWrapper extends ObjectBank<List<CoreLabel>> {
private static final long serialVersionUID = -3838331732026362075L;

// Settings that decide which transformations (IOB mapping, tag merging,
// word shapes, normalization, interning) are applied to each document.
private SeqClassifierFlags flags;
// The underlying ObjectBank whose documents this wrapper post-processes.
private ObjectBank<List<CoreLabel>> wrapped;
// Lower-case words seen in the data; consulted by the WordShapeClassifier
// and filled in by iterator() when word shapes are in use.
private Set<String> knownLCWords;

/**
 * Wraps an existing ObjectBank so its documents are processed according
 * to the given flags as they are iterated over.
 *
 * @param flags settings controlling what processing is done
 * @param wrapped the ObjectBank to wrap
 * @param knownLCWords set collecting known lower-case words (may be added to)
 */
public ObjectBankWrapper(SeqClassifierFlags flags, ObjectBank<List<CoreLabel>> wrapped, Set<String> knownLCWords) {
  // NOTE(review): the superclass is constructed with nulls — presumably every
  // ObjectBank operation is delegated to 'wrapped' (see the overrides below),
  // so the superclass state is never used. Confirm against ObjectBank.
  super(null,null);
  this.flags = flags;
  this.wrapped = wrapped;
  this.knownLCWords = knownLCWords;
}
/**
 * Returns an iterator over the processed documents.
 * <p>
 * If the WordShapeClassifier is in use (and shape strings are not), an
 * extra full pass is made over the data first to populate
 * {@code knownLCWords} with every word starting with a lower-case letter;
 * a fresh iterator is then returned for the real pass.
 *
 * @return an iterator whose documents have been length-fixed and processed
 */
@Override
public Iterator<List<CoreLabel>> iterator() {
  Iterator<List<CoreLabel>> iter = new WrappedIterator(wrapped.iterator());

  // If using WordShapeClassifier, we have to make an extra pass through the
  // data before we really process it, so that we can build up the
  // database of known lower case words in the data. We do that here.
  if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) && (!flags.useShapeStrings)) {
    while (iter.hasNext()) {
      List<CoreLabel> doc = iter.next();
      for (CoreLabel fl : doc) {
        String word = fl.word();
        // Guard against tokens with no word set (e.g., character-based
        // readers store text under CharAnnotation instead); previously
        // a null word here would throw a NullPointerException.
        if (word != null && word.length() > 0) {
          char ch = word.charAt(0);
          if (Character.isLowerCase(ch)) {
            knownLCWords.add(word);
          }
        }
      }
    }
    // Start over with a fresh iterator for the real pass.
    iter = new WrappedIterator(wrapped.iterator());
  }
  return iter;
}
/**
 * Iterator that pulls raw documents from the underlying source, runs them
 * through {@link #fixDocLengths} (which may split one document into several
 * pieces, or drop an empty one), and hands out each processed piece in turn.
 */
private class WrappedIterator extends AbstractIterator<List<CoreLabel>> {
  // Source of raw, unprocessed documents.
  private final Iterator<List<CoreLabel>> source;
  // Pieces of the most recently split document, handed out one at a time.
  private Iterator<List<CoreLabel>> pending;

  public WrappedIterator(Iterator<List<CoreLabel>> wrappedIter) {
    this.source = wrappedIter;
  }

  @Override
  public boolean hasNext() {
    boolean piecesLeft = (pending != null && pending.hasNext());
    return piecesLeft || source.hasNext();
  }

  @Override
  public List<CoreLabel> next() {
    // Refill the pending queue until it has a piece to hand out; if the
    // source runs dry first, source.next() throws NoSuchElementException,
    // which is the correct contract for next().
    while (pending == null || !pending.hasNext()) {
      List<List<CoreLabel>> pieces = new ArrayList<List<CoreLabel>>();
      pieces.add(source.next());
      fixDocLengths(pieces);
      pending = pieces.iterator();
    }
    return processDocument(pending.next());
  }
}
/**
 * Applies the flag-controlled transformations to one document, in place.
 *
 * @param doc the document to process
 * @return the same (mutated) document, for convenience
 */
public List<CoreLabel> processDocument(List<CoreLabel> doc) {
  // Optional tag-scheme rewrites first, then the unconditional per-token work.
  if (flags.mergeTags) {
    mergeTags(doc);
  }
  if (flags.iobTags) {
    iobTags(doc);
  }
  doBasicStuff(doc);
  return doc;
}
/** Interns the string when flags.intern is set; otherwise returns it untouched. */
private String intern(String s) {
  return flags.intern ? s.intern() : s;
}
// English day-of-week and month names, matched case-insensitively.
// static final: compiled once for the class rather than per instance.
private static final Pattern monthDayPattern = Pattern.compile("Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday|January|February|March|April|May|June|July|August|September|October|November|December", Pattern.CASE_INSENSITIVE);

/**
 * Optionally normalizes a token: day/month names are lowercased when
 * normalizeTerms or normalizeTimex is on, and spellings are Americanized
 * when normalizeTerms is on. Otherwise returns the word unchanged.
 *
 * @param word the token text
 * @return the (possibly) normalized token text
 */
private String fix(String word) {
  if (flags.normalizeTerms || flags.normalizeTimex) {
    // Same case for days/months: map to lowercase.
    if (monthDayPattern.matcher(word).matches()) {
      // Locale.ROOT makes the case mapping locale-independent; the default
      // locale could corrupt names (e.g. Turkish maps 'I' to dotless 'ı',
      // so "APRIL".toLowerCase() would yield "aprıl").
      return word.toLowerCase(Locale.ROOT);
    }
  }
  if (flags.normalizeTerms) {
    return Americanize.americanize(word, false);
  }
  return word;
}
/**
 * Unconditional per-token processing: records each token's position in the
 * document, optionally computes its word shape, and normalizes/interns its
 * text. Mutates the CoreLabels in place.
 */
private void doBasicStuff(List<CoreLabel> doc) {
  int position = 0;
  for (CoreLabel fl : doc) {
    // Record the token's 0-based position in the document (stored as a String).
    fl.set(PositionAnnotation.class, Integer.toString((position++)));
    // Word shape, using the knownLCWords database built by iterator().
    if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) && (!flags.useShapeStrings)) {
      String s = intern(WordShapeClassifier.wordShape(fl.word(), flags.wordShape, knownLCWords));
      fl.set(ShapeAnnotation.class, s);
    }
    // Normalizing and interning.
    // Historical note: this test was previously
    //   if ("CTBSegDocumentReader".equalsIgnoreCase(flags.documentReader)) {
    // and matches on the reader class name configured in the flags.
    if ("edu.stanford.nlp.wordseg.Sighan2005DocumentReaderAndWriter".equalsIgnoreCase(flags.readerAndWriter)) {
      // For Chinese segmentation, "word" is no use; normalize/intern the
      // character annotation instead, and skip copying goldAnswer for
      // memory efficiency.
      fl.set(CharAnnotation.class,intern(fix(fl.get(CharAnnotation.class))));
    } else {
      fl.setWord(intern(fix(fl.word())));
      // Preserve the original answer as the gold answer before training/
      // decoding overwrites AnswerAnnotation.
      fl.set(GoldAnswerAnnotation.class, fl.get(AnswerAnnotation.class));
    }
  }
}
/**
 * Take a {@link List} of documents (which are themselves {@link List}s
 * of {@link CoreLabel}s) and if any are longer than the length
 * specified by flags.maxDocSize split them up. It tries to be smart
 * and split on sentence boundaries, hard-coded to the English-specific token
 * '.', searching backwards from maxSize but no further than maxSize/2;
 * if no '.' is found in that window it splits at exactly maxSize.
 * A no-op when flags.maxDocSize &lt;= 0.
 * <p>
 * TODO: This implementation is broken. It fails on a zero length document:
 * after the while loop it doesn't get added (being empty), it gets removed
 * from the list, and then i doesn't increment because no new document was
 * added — so an empty input document is silently dropped from {@code docs}.
 * WrappedIterator.next() compensates by looping until it finds a non-empty
 * spillover, so changing the dropping behavior here would change what the
 * iterator yields.
 */
private void fixDocLengths(List<List<CoreLabel>> docs) {
  int maxSize = flags.maxDocSize;
  if (maxSize <= 0) {
    // Splitting disabled.
    return;
  }
  for (int i = 0; i < docs.size(); i++) {
    List<CoreLabel> document = docs.get(i);
    List<List<CoreLabel>> newDocuments = new ArrayList<List<CoreLabel>>();
    while (document.size() > maxSize) {
      // Look backwards from maxSize (down to maxSize/2) for a '.' token to
      // split after; splitIndex == 0 means none was found in the window.
      int splitIndex = 0;
      for (int j = maxSize; j > maxSize / 2; j--) {
        CoreLabel wi = document.get(j);
        if (wi.word().equals(".")) {
          splitIndex = j + 1;
          break;
        }
      }
      if (splitIndex == 0) {
        // No sentence boundary found; split at the hard limit.
        splitIndex = maxSize;
      }
      // NOTE: subList returns views backed by the original document list;
      // the backing list is never structurally modified afterwards, so the
      // views stay valid.
      List<CoreLabel> newDoc = document.subList(0, splitIndex);
      newDocuments.add(newDoc);
      document = document.subList(splitIndex, document.size());
    }
    if ( ! document.isEmpty()) {
      newDocuments.add(document);
    }
    // Replace the original document with its pieces, preserving order.
    docs.remove(i);
    Collections.reverse(newDocuments);
    for (List<CoreLabel> item : newDocuments) {
      docs.add(i, item);
    }
    // Skip past the pieces just inserted (the loop's i++ accounts for one).
    i += newDocuments.size() - 1;
  }
}
/**
 * Rewrites the AnswerAnnotation of each token into IOB2 form: the first
 * token of each entity run gets "B-label", subsequent tokens "I-label";
 * background-symbol tokens are left untouched. Tags already carrying an
 * explicit "B" prefix are kept as-is.
 */
private void iobTags(List<CoreLabel> doc) {
  String lastTag = "";
  for (CoreLabel wi : doc) {
    String answer = wi.get(AnswerAnnotation.class);
    if (!answer.equals(flags.backgroundSymbol)) {
      int index = answer.indexOf('-');
      String prefix;
      String label;
      if (index == 1) {
        // Single-character IOB prefix such as "B-PER" / "I-LOC".
        prefix = answer.substring(0, 1);
        label = answer.substring(2);
      } else {
        // No IOB prefix. This includes labels containing an embedded dash
        // (e.g. "PER-ORG"): the old code assumed any dash sat at index 1
        // and would shred such labels via substring(0,1)/substring(2).
        prefix = "";
        label = answer;
      }
      if (!prefix.equals("B")) {
        if (!lastTag.equals(label)) {
          wi.set(AnswerAnnotation.class, "B-" + label);
        } else {
          wi.set(AnswerAnnotation.class, "I-" + label);
        }
      }
      lastTag = label;
    } else {
      lastTag = answer;
    }
  }
}
/**
 * Strips the IOB prefix from each token's AnswerAnnotation, so "B-PER" and
 * "I-PER" both become "PER"; background-symbol tokens are left unchanged.
 */
private void mergeTags(List<CoreLabel> doc) {
  for (CoreLabel wi : doc) {
    String answer = wi.get(AnswerAnnotation.class);
    // Only strip a genuine single-character prefix (dash at index 1, as in
    // "B-PER"). The old test was indexOf('-') >= 0, which chopped the first
    // two characters off any answer containing a dash anywhere — corrupting
    // labels with embedded dashes (e.g. "PER-ORG" became "R-ORG").
    if (!answer.equals(flags.backgroundSymbol) && answer.indexOf('-') == 1) {
      answer = answer.substring(2);
    }
    wi.set(AnswerAnnotation.class, answer);
  }
}
// Collection/ObjectBank operations, all delegated straight to 'wrapped'
// (the superclass was constructed with nulls and holds no usable state).
@Override
public boolean add(List<CoreLabel> o) { return wrapped.add(o); }
@Override
public boolean addAll(Collection<? extends List<CoreLabel>> c) { return wrapped.addAll(c); }
@Override
public void clear() { wrapped.clear(); }
@Override
public void clearMemory() { wrapped.clearMemory(); }
// NOTE(review): no @Override — Collection.contains takes Object, so this is
// an overload, not an override; calls through the Collection interface will
// not reach it. Presumably intentional for type safety; confirm.
public boolean contains(List<CoreLabel> o) { return wrapped.contains(o); }
@Override
public boolean containsAll(Collection<?> c) { return wrapped.containsAll(c); }
@Override
public boolean isEmpty() { return wrapped.isEmpty(); }
@Override
public void keepInMemory(boolean keep) { wrapped.keepInMemory(keep); }
// NOTE(review): overload of Collection.remove(Object), not an override — see
// contains above.
public boolean remove(List<CoreLabel> o) { return wrapped.remove(o); }
@Override
public boolean removeAll(Collection<?> c) { return wrapped.removeAll(c); }
@Override
public boolean retainAll(Collection<?> c) { return wrapped.retainAll(c); }
@Override
public int size() { return wrapped.size(); }
@Override
public Object[] toArray() { return wrapped.toArray(); }
// Typed-array variant; delegates to the wrapped bank's toArray.
public List<CoreLabel>[] toArray(List<CoreLabel>[] o) { return wrapped.toArray(o); }
} // end class ObjectBankWrapper