package edu.stanford.nlp.sequences;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.objectbank.ObjectBank;
import edu.stanford.nlp.process.Americanize;
import edu.stanford.nlp.process.WordShapeClassifier;
import edu.stanford.nlp.process.WordToSentenceProcessor;
import edu.stanford.nlp.util.AbstractIterator;
import edu.stanford.nlp.util.CoreMap;

import java.util.*;
import java.util.regex.Pattern;

/**
 * This class is used to wrap the ObjectBank used by the sequence
 * models and is where any sort of general processing, like the IOB mapping
 * stuff and wordshape stuff, should go.
 * It checks the SeqClassifierFlags to decide what to do.
 * <p>
 * TODO: We should rearchitect this so that the FeatureFactory-specific
 * stuff is done by a callback to the relevant FeatureFactory.
 *
 * @author Jenny Finkel
 */
public class ObjectBankWrapper<IN extends CoreMap> extends ObjectBank<List<IN>> {

  private static final long serialVersionUID = -3838331732026362075L;

  private final SeqClassifierFlags flags;
  private final ObjectBank<List<IN>> wrapped;
  private final Set<String> knownLCWords;


  public ObjectBankWrapper(SeqClassifierFlags flags, ObjectBank<List<IN>> wrapped, Set<String> knownLCWords) {
    super(null, null);
    this.flags = flags;
    this.wrapped = wrapped;
    this.knownLCWords = knownLCWords;
  }


  @Override
  public Iterator<List<IN>> iterator() {
    return new WrappedIterator(wrapped.iterator());
  }

  private class WrappedIterator extends AbstractIterator<List<IN>> {

    Iterator<List<IN>> wrappedIter;
    Iterator<List<IN>> spilloverIter;

    public WrappedIterator(Iterator<List<IN>> wrappedIter) {
      this.wrappedIter = wrappedIter;
    }

    @Override
    public boolean hasNext() {
      while ((spilloverIter == null || !spilloverIter.hasNext()) &&
             wrappedIter.hasNext()) {
        List<IN> doc = wrappedIter.next();
        List<List<IN>> docs = new ArrayList<>();
        docs.add(doc);
        fixDocLengths(docs);
        spilloverIter = docs.iterator();
      }
      return wrappedIter.hasNext() || (spilloverIter != null && spilloverIter.hasNext());
    }

    @Override
    public List<IN> next() {
      // this while loop now is redundant because it should
      // have already been done in "hasNext".
      // I'm keeping it so that the diff is minimal.
      // -pichuan
      while (spilloverIter == null || !spilloverIter.hasNext()) {
        List<IN> doc = wrappedIter.next();
        List<List<IN>> docs = new ArrayList<>();
        docs.add(doc);
        fixDocLengths(docs);
        spilloverIter = docs.iterator();
      }

      return processDocument(spilloverIter.next());
    }

  } // end class WrappedIterator


  public List<IN> processDocument(List<IN> doc) {
    if (flags.mergeTags) { mergeTags(doc); }
    if (flags.iobTags) { iobTags(doc); }
    doBasicStuff(doc);

    return doc;
  }

  private String intern(String s) {
    if (flags.intern) {
      return s.intern();
    } else {
      return s;
    }
  }

  private final Pattern monthDayPattern = Pattern.compile("Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday|January|February|March|April|May|June|July|August|September|October|November|December", Pattern.CASE_INSENSITIVE);

  private String fix(String word) {
    if (flags.normalizeTerms || flags.normalizeTimex) {
      // Same case for days/months: map to lowercase
      if (monthDayPattern.matcher(word).matches()) {
        return word.toLowerCase();
      }
    }
    if (flags.normalizeTerms) {
      return Americanize.americanize(word, false);
    }
    return word;
  }


  private void doBasicStuff(List<IN> doc) {
    int position = 0;
    for (IN fl : doc) {
      // position in document
      fl.set(CoreAnnotations.PositionAnnotation.class, Integer.toString(position++));

      // word shape
      if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) && (!flags.useShapeStrings)) {
        // TODO: if we pass in a FeatureFactory, as suggested by an earlier comment,
        // we should use that FeatureFactory's getWord function
        String word = fl.get(CoreAnnotations.TextAnnotation.class);
        if (flags.wordFunction != null) {
          word = flags.wordFunction.apply(word);
        }
        if ( ! word.isEmpty() && Character.isLowerCase(word.codePointAt(0))) {
          knownLCWords.add(word);
        }
        String s = intern(WordShapeClassifier.wordShape(word, flags.wordShape, knownLCWords));
        fl.set(CoreAnnotations.ShapeAnnotation.class, s);
      }

      // normalizing and interning
      // was the following; should presumably now be
      // if ("CTBSegDocumentReader".equalsIgnoreCase(flags.documentReader)) {
      if ("edu.stanford.nlp.wordseg.Sighan2005DocumentReaderAndWriter".equalsIgnoreCase(flags.readerAndWriter)) {
        // for Chinese segmentation, the "word" annotation is not used, and goldAnswer is ignored for memory efficiency.
        fl.set(CoreAnnotations.CharAnnotation.class, intern(fix(fl.get(CoreAnnotations.CharAnnotation.class))));
      } else {
        fl.set(CoreAnnotations.TextAnnotation.class, intern(fix(fl.get(CoreAnnotations.TextAnnotation.class))));
        // only override GoldAnswer if not set - so that a DocumentReaderAndWriter can set it right in the first place.
        if (fl.get(CoreAnnotations.GoldAnswerAnnotation.class) == null) {
          fl.set(CoreAnnotations.GoldAnswerAnnotation.class, fl.get(CoreAnnotations.AnswerAnnotation.class));
        }
      }
    }
  }

  /**
   * Take a {@link List} of documents (which are themselves {@link List}s
   * of something that extends {@link CoreMap}, CoreLabel by default),
   * and if any are longer than the length
   * specified by flags.maxDocSize split them up. If maxDocSize is negative,
   * nothing is changed. In practice, documents need to be not too long or
   * else the CRF inference will fail due to numerical problems.
   * This method tries to be smart
   * and split on sentence boundaries, but this is hard-coded to English.
   *
   * @param docs The list of documents whose length might be adjusted.
   */
  private void fixDocLengths(List<List<IN>> docs) {
    final int maxDocSize = flags.maxDocSize;

    WordToSentenceProcessor<IN> wts = new WordToSentenceProcessor<>();
    List<List<IN>> newDocuments = new ArrayList<>();
    for (List<IN> document : docs) {
      if (maxDocSize <= 0 || document.size() <= maxDocSize) {
        if (flags.keepEmptySentences || ! document.isEmpty()) {
          newDocuments.add(document);
        }
        continue;
      }
      List<List<IN>> sentences = wts.process(document);
      List<IN> newDocument = new ArrayList<>();
      for (List<IN> sentence : sentences) {
        if (newDocument.size() + sentence.size() > maxDocSize) {
          if ( ! newDocument.isEmpty()) {
            newDocuments.add(newDocument);
          }
          newDocument = new ArrayList<>();
        }
        newDocument.addAll(sentence);
      }
      if (flags.keepEmptySentences || ! newDocument.isEmpty()) {
        newDocuments.add(newDocument);
      }
    }

    docs.clear();
    docs.addAll(newDocuments);
  }

  /**
   * Convert the AnswerAnnotation tags to IOB2 encoding: the first token of each
   * entity gets a "B-" prefix, later tokens of the same entity get an "I-" prefix,
   * and tags that already carry a "B-" prefix are left unchanged.
   *
   * @param doc The document whose AnswerAnnotation is rewritten in place
   */
  private void iobTags(List<IN> doc) {
    String lastTag = "";
    for (IN wi : doc) {
      String answer = wi.get(CoreAnnotations.AnswerAnnotation.class);
      if (answer != null && !flags.backgroundSymbol.equals(answer)) {
        int index = answer.indexOf('-');
        String prefix;
        String label;
        if (index < 0) {
          prefix = "";
          label = answer;
        } else {
          prefix = answer.substring(0, index);
          label = answer.substring(index + 1);
        }

        if (!prefix.equals("B")) {
          if (!label.equals(lastTag)) {
            wi.set(CoreAnnotations.AnswerAnnotation.class, "B-" + label);
          } else {
            wi.set(CoreAnnotations.AnswerAnnotation.class, "I-" + label);
          }
        }
        lastTag = label;
      } else {
        lastTag = answer;
      }
    }
  }


  /** Change some form of IOB/IOE encoding via forms like "I-PERS" to
   *  IO encoding as just "PERS".
   *
   *  @param doc The document for which the AnswerAnnotation will be changed (in place)
   */
  private void mergeTags(List<IN> doc) {
    for (IN wi : doc) {
      String answer = wi.get(CoreAnnotations.AnswerAnnotation.class);
      if (answer == null) {
        continue;
      }
      if ( ! answer.equals(flags.backgroundSymbol)) {
        int index = answer.indexOf('-');
        if (index >= 0) {
          answer = answer.substring(index + 1);
        }
      }
      wi.set(CoreAnnotations.AnswerAnnotation.class, answer);
    }
  }


  // all the other methods from ObjectBank just delegate to the wrapped ObjectBank

  @Override
  public boolean add(List<IN> o) { return wrapped.add(o); }

  @Override
  public boolean addAll(Collection<? extends List<IN>> c) { return wrapped.addAll(c); }

  @Override
  public void clear() { wrapped.clear(); }

  @Override
  public void clearMemory() { wrapped.clearMemory(); }

  public boolean contains(List<IN> o) { return wrapped.contains(o); }

  @Override
  public boolean containsAll(Collection<?> c) { return wrapped.containsAll(c); }

  @Override
  public boolean isEmpty() { return wrapped.isEmpty(); }

  @Override
  public void keepInMemory(boolean keep) { wrapped.keepInMemory(keep); }

  public boolean remove(List<IN> o) { return wrapped.remove(o); }

  @Override
  public boolean removeAll(Collection<?> c) { return wrapped.removeAll(c); }

  @Override
  public boolean retainAll(Collection<?> c) { return wrapped.retainAll(c); }

  @Override
  public int size() { return wrapped.size(); }

  @Override
  public Object[] toArray() { return wrapped.toArray(); }

  public List<IN>[] toArray(List<IN>[] o) { return wrapped.toArray(o); }

} // end class ObjectBankWrapper
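
/*
 * A minimal usage sketch (not part of the original class; the names "flags" and
 * "documents" and the loop body are assumptions for illustration). A sequence model
 * would typically wrap the ObjectBank it reads its documents from, so that every
 * document handed back by iteration has already been length-limited, IOB-normalized,
 * and word-shape annotated:
 *
 *   SeqClassifierFlags flags = ...;               // configured elsewhere
 *   ObjectBank<List<CoreLabel>> documents = ...;  // e.g. produced by a DocumentReaderAndWriter
 *   ObjectBankWrapper<CoreLabel> wrapper =
 *       new ObjectBankWrapper<>(flags, documents, new HashSet<>());
 *   for (List<CoreLabel> doc : wrapper) {
 *     // train or evaluate on the processed document
 *   }
 */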