package edu.stanford.nlp.pipeline;

import java.util.*;

import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.WordToSentenceProcessor;
import edu.stanford.nlp.util.ArraySet;
import edu.stanford.nlp.util.ArrayUtils;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.logging.Redwood;

/**
 * This class assumes that there is a {@code List<CoreLabel>}
 * under the {@code TokensAnnotation} field, and runs it
 * through {@link edu.stanford.nlp.process.WordToSentenceProcessor}
 * and puts the new {@code List<Annotation>}
 * under the {@code SentencesAnnotation} field.
 *
 * @author Jenny Finkel
 * @author Christopher Manning
 */
public class WordsToSentencesAnnotator implements Annotator {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(WordsToSentencesAnnotator.class);

  /** The underlying sentence splitter that partitions the token stream. */
  private final WordToSentenceProcessor<CoreLabel> wts;

  private final boolean VERBOSE;

  /** If true, empty token lists from the splitter are treated as (and counted as) blank lines. */
  private final boolean countLineNumbers;

  public WordsToSentencesAnnotator() {
    this(false);
  }

  /**
   * Builds an annotator configured from CoreNLP {@code ssplit.*} and newline-splitting
   * properties. Three configurations are possible:
   * newline-only splitting (optionally whitespace-tokenized), one-sentence mode,
   * or regular boundary-regex based splitting.
   *
   * @param properties Pipeline properties ({@code ssplit.isOneSentence},
   *                   {@code ssplit.boundaryTokenRegex}, {@code ssplit.boundariesToDiscard}, etc.)
   */
  public WordsToSentencesAnnotator(Properties properties) {
    // log.info(signature());
    // todo: The above shows that signature is edu.stanford.nlp.pipeline.AnnotatorImplementations: and doesn't reflect what annotator it is! Should fix. Maybe is fixed now [2016]. Test!
    boolean nlSplitting = Boolean.parseBoolean(properties.getProperty(StanfordCoreNLP.NEWLINE_SPLITTER_PROPERTY, "false"));
    if (nlSplitting) {
      boolean whitespaceTokenization = Boolean.parseBoolean(properties.getProperty("tokenize.whitespace", "false"));
      if (whitespaceTokenization) {
        // With whitespace tokenization the newline characters themselves are the boundary tokens.
        if (System.lineSeparator().equals("\n")) {
          // this constructor will keep empty lines as empty sentences
          WordToSentenceProcessor<CoreLabel> wts1 =
              new WordToSentenceProcessor<>(ArrayUtils.asImmutableSet(new String[]{"\n"}));
          VERBOSE = false;
          this.countLineNumbers = true;
          this.wts = wts1;
        } else {
          // throw "\n" in just in case files use that instead of the system separator
          // this constructor will keep empty lines as empty sentences
          WordToSentenceProcessor<CoreLabel> wts1 =
              new WordToSentenceProcessor<>(ArrayUtils.asImmutableSet(new String[]{System.lineSeparator(), "\n"}));
          VERBOSE = false;
          this.countLineNumbers = true;
          this.wts = wts1;
        }
      } else {
        // PTB-style tokenization replaces newlines with a special token; split on that.
        // this constructor will keep empty lines as empty sentences
        WordToSentenceProcessor<CoreLabel> wts1 =
            new WordToSentenceProcessor<>(ArrayUtils.asImmutableSet(new String[]{PTBTokenizer.getNewlineToken()}));
        VERBOSE = false;
        this.countLineNumbers = true;
        this.wts = wts1;
      }
    } else {
      // Treat as one sentence: You get a no-op sentence splitter that always returns all tokens as one sentence.
      String isOneSentence = properties.getProperty("ssplit.isOneSentence");
      if (Boolean.parseBoolean(isOneSentence)) { // this method treats null as false
        WordToSentenceProcessor<CoreLabel> wts1 = new WordToSentenceProcessor<>(true);
        VERBOSE = false;
        this.countLineNumbers = false;
        this.wts = wts1;
      } else {
        // multi token sentence boundaries
        String boundaryMultiTokenRegex = properties.getProperty("ssplit.boundaryMultiTokenRegex");

        // Discard these tokens without marking them as sentence boundaries
        String tokenPatternsToDiscardProp = properties.getProperty("ssplit.tokenPatternsToDiscard");
        Set<String> tokenRegexesToDiscard = null;
        if (tokenPatternsToDiscardProp != null) {
          String[] toks = tokenPatternsToDiscardProp.split(",");
          tokenRegexesToDiscard = Generics.newHashSet(Arrays.asList(toks));
        }

        // regular boundaries
        String boundaryTokenRegex = properties.getProperty("ssplit.boundaryTokenRegex");
        Set<String> boundariesToDiscard = null;
        // todo [cdm 2016]: Add support for specifying ssplit.boundaryFollowerRegex here and send down to WordsToSentencesAnnotator

        // newline boundaries which are discarded.
        String bounds = properties.getProperty("ssplit.boundariesToDiscard");
        if (bounds != null) {
          String[] toks = bounds.split(",");
          boundariesToDiscard = Generics.newHashSet(Arrays.asList(toks));
        }

        // HTML boundaries which are discarded
        Set<String> htmlElementsToDiscard = null;
        bounds = properties.getProperty("ssplit.htmlBoundariesToDiscard");
        if (bounds != null) {
          String[] elements = bounds.split(",");
          htmlElementsToDiscard = Generics.newHashSet(Arrays.asList(elements));
        }

        String nlsb = properties.getProperty(StanfordCoreNLP.NEWLINE_IS_SENTENCE_BREAK_PROPERTY,
            StanfordCoreNLP.DEFAULT_NEWLINE_IS_SENTENCE_BREAK);

        WordToSentenceProcessor<CoreLabel> wts1 = new WordToSentenceProcessor<>(boundaryTokenRegex, null,
            boundariesToDiscard, htmlElementsToDiscard,
            WordToSentenceProcessor.stringToNewlineIsSentenceBreak(nlsb),
            (boundaryMultiTokenRegex != null) ? TokenSequencePattern.compile(boundaryMultiTokenRegex) : null,
            tokenRegexesToDiscard);
        VERBOSE = false;
        this.countLineNumbers = false;
        this.wts = wts1;
      }
    }
  }

  public WordsToSentencesAnnotator(boolean verbose) {
    this(verbose, false, new WordToSentenceProcessor<>());
  }

  public WordsToSentencesAnnotator(boolean verbose, String boundaryTokenRegex,
                                   Set<String> boundaryToDiscard, Set<String> htmlElementsToDiscard,
                                   String newlineIsSentenceBreak, String boundaryMultiTokenRegex,
                                   Set<String> tokenRegexesToDiscard) {
    this(verbose, false,
        new WordToSentenceProcessor<>(boundaryTokenRegex, null,
            boundaryToDiscard, htmlElementsToDiscard,
            WordToSentenceProcessor.stringToNewlineIsSentenceBreak(newlineIsSentenceBreak),
            (boundaryMultiTokenRegex != null) ? TokenSequencePattern.compile(boundaryMultiTokenRegex) : null,
            tokenRegexesToDiscard));
  }

  private WordsToSentencesAnnotator(boolean verbose, boolean countLineNumbers,
                                    WordToSentenceProcessor<CoreLabel> wts) {
    VERBOSE = verbose;
    this.countLineNumbers = countLineNumbers;
    this.wts = wts;
  }

  /** Return a WordsToSentencesAnnotator that splits on newlines (only), which are then deleted.
   *  This constructor counts the lines by putting in empty token lists for empty lines.
   *  It tells the underlying splitter to return empty lists of tokens
   *  and then treats those empty lists as empty lines. We don't
   *  actually include empty sentences in the annotation, though. But they
   *  are used in numbering the sentence. Only this constructor leads to
   *  empty sentences.
   *
   *  @param nlToken Zero or more new line tokens, which might be a {@literal \n} or the fake
   *                 newline tokens returned from the tokenizer.
   *  @return A WordsToSentenceAnnotator.
   */
  public static WordsToSentencesAnnotator newlineSplitter(String... nlToken) {
    // this constructor will keep empty lines as empty sentences
    WordToSentenceProcessor<CoreLabel> wts =
        new WordToSentenceProcessor<>(ArrayUtils.asImmutableSet(nlToken));
    return new WordsToSentencesAnnotator(false, true, wts);
  }

  /** Return a WordsToSentencesAnnotator that never splits the token stream. You just get one sentence.
   *
   *  @return A WordsToSentenceAnnotator.
   */
  public static WordsToSentencesAnnotator nonSplitter() {
    WordToSentenceProcessor<CoreLabel> wts = new WordToSentenceProcessor<>(true);
    return new WordsToSentencesAnnotator(false, false, wts);
  }

  /**
   * Splits the document's token list into sentences and stores them under
   * {@code SentencesAnnotation}. Each sentence gets character/token offsets, a sentence
   * index, and (when sections are present) section membership information.
   *
   * If setCountLineNumbers is set to true, we count line numbers by
   * telling the underlying splitter to return empty lists of tokens
   * and then treating those empty lists as empty lines. We don't
   * actually include empty sentences in the annotation, though.
   *
   * @param annotation A document with {@code TextAnnotation} and {@code TokensAnnotation} set
   * @throws IllegalArgumentException if the annotation has no tokens
   */
  @Override
  public void annotate(Annotation annotation) {
    if (VERBOSE) {
      log.info("Sentence splitting ...");
    }
    if ( ! annotation.containsKey(CoreAnnotations.TokensAnnotation.class)) {
      throw new IllegalArgumentException("WordsToSentencesAnnotator: unable to find words/tokens in: " + annotation);
    }

    // get text and tokens from the document
    String text = annotation.get(CoreAnnotations.TextAnnotation.class);
    List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
    String docID = annotation.get(CoreAnnotations.DocIDAnnotation.class);
    // log.info("Tokens are: " + tokens);

    // assemble the sentence annotations
    int tokenOffset = 0;
    int lineNumber = 0;
    // section annotations to mark sentences with
    CoreMap sectionAnnotations = null;
    List<CoreMap> sentences = new ArrayList<>();
    // keep track of current section to assign sentences to sections;
    // sections may be null if no sections-aware annotator ran on this document
    int currSectionIndex = 0;
    List<CoreMap> sections = annotation.get(CoreAnnotations.SectionsAnnotation.class);

    for (List<CoreLabel> sentenceTokens : wts.process(tokens)) {
      if (countLineNumbers) {
        ++lineNumber;
      }
      if (sentenceTokens.isEmpty()) {
        if ( ! countLineNumbers) {
          // empty token lists only come back from the splitter in line-counting mode
          throw new IllegalStateException("unexpected empty sentence: " + sentenceTokens);
        } else {
          // an empty line: counted above, but not added as a sentence
          continue;
        }
      }

      // get the sentence text from the first and last character offsets
      int begin = sentenceTokens.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
      int last = sentenceTokens.size() - 1;
      int end = sentenceTokens.get(last).get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
      String sentenceText = text.substring(begin, end);

      // create a sentence annotation with text and token offsets
      Annotation sentence = new Annotation(sentenceText);
      sentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, begin);
      sentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, end);
      sentence.set(CoreAnnotations.TokensAnnotation.class, sentenceTokens);
      sentence.set(CoreAnnotations.TokenBeginAnnotation.class, tokenOffset);
      tokenOffset += sentenceTokens.size();
      sentence.set(CoreAnnotations.TokenEndAnnotation.class, tokenOffset);
      sentence.set(CoreAnnotations.SentenceIndexAnnotation.class, sentences.size());
      if (countLineNumbers) {
        sentence.set(CoreAnnotations.LineNumberAnnotation.class, lineNumber);
      }

      // Annotate sentence with section information.
      // Assume section start and end appear as first and last tokens of sentence
      CoreLabel sentenceStartToken = sentenceTokens.get(0);
      CoreLabel sentenceEndToken = sentenceTokens.get(sentenceTokens.size() - 1);

      CoreMap sectionStart = sentenceStartToken.get(CoreAnnotations.SectionStartAnnotation.class);
      if (sectionStart != null) {
        // Section is started
        sectionAnnotations = sectionStart;
      }
      if (sectionAnnotations != null) {
        // transfer annotations over to sentence
        ChunkAnnotationUtils.copyUnsetAnnotations(sectionAnnotations, sentence);
      }
      String sectionEnd = sentenceEndToken.get(CoreAnnotations.SectionEndAnnotation.class);
      if (sectionEnd != null) {
        sectionAnnotations = null;
      }

      // determine section index for this sentence if keeping track of sections;
      // skip entirely when the document carries no SectionsAnnotation (fixes NPE)
      if (sections != null) {
        // try to find a section that ends after this sentence ends, check if it encloses sentence
        // if it doesn't, that means this sentence is in two sections
        while (currSectionIndex < sections.size()) {
          int currSectionCharBegin = sections.get(currSectionIndex).get(
              CoreAnnotations.CharacterOffsetBeginAnnotation.class);
          int currSectionCharEnd = sections.get(currSectionIndex).get(
              CoreAnnotations.CharacterOffsetEndAnnotation.class);
          if (currSectionCharEnd < end) {
            // section ends before this sentence does; move to the next candidate section
            currSectionIndex++;
            continue;
          } else {
            // check if this sentence fits in the candidate section
            if (currSectionCharBegin <= begin) {
              // add sentence to section list
              sections.get(currSectionIndex).get(CoreAnnotations.SentencesAnnotation.class).add(sentence);
              // set sentence's section date
              String sectionDate = sections.get(currSectionIndex).get(CoreAnnotations.SectionDateAnnotation.class);
              sentence.set(CoreAnnotations.SectionDateAnnotation.class, sectionDate);
              // set section index
              sentence.set(CoreAnnotations.SectionIndexAnnotation.class, currSectionIndex);
            }
            break;
          }
        }
      }

      if (docID != null) {
        sentence.set(CoreAnnotations.DocIDAnnotation.class, docID);
      }

      // number tokens within the sentence (1-based) and link them back to doc/sentence
      int index = 1;
      for (CoreLabel token : sentenceTokens) {
        token.setIndex(index++);
        token.setSentIndex(sentences.size());
        if (docID != null) {
          token.setDocID(docID);
        }
      }

      // add the sentence to the list
      sentences.add(sentence);
    }

    // the condition below is possible if sentenceBoundaryToDiscard is initialized!
    /*
    if (tokenOffset != tokens.size()) {
      throw new RuntimeException(String.format(
          "expected %d tokens, found %d", tokens.size(), tokenOffset));
    }
    */

    // add the sentences annotations to the document
    annotation.set(CoreAnnotations.SentencesAnnotation.class, sentences);
  }

  @Override
  public Set<Class<? extends CoreAnnotation>> requires() {
    return Collections.unmodifiableSet(new ArraySet<>(Arrays.asList(
        CoreAnnotations.TextAnnotation.class,
        CoreAnnotations.TokensAnnotation.class,
        CoreAnnotations.CharacterOffsetBeginAnnotation.class,
        CoreAnnotations.CharacterOffsetEndAnnotation.class
    )));
  }

  @Override
  public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
    return new HashSet<>(Arrays.asList(
        CoreAnnotations.SentencesAnnotation.class,
        CoreAnnotations.SentenceIndexAnnotation.class
    ));
  }

}