package edu.stanford.nlp.pipeline;
import java.util.*;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.WordToSentenceProcessor;
import edu.stanford.nlp.util.ArraySet;
import edu.stanford.nlp.util.ArrayUtils;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.logging.Redwood;
/**
* This class assumes that there is a {@code List<CoreLabel>}
* under the {@code TokensAnnotation} field, and runs it
* through {@link edu.stanford.nlp.process.WordToSentenceProcessor}
* and puts the new {@code List<Annotation>}
* under the {@code SentencesAnnotation} field.
*
* @author Jenny Finkel
* @author Christopher Manning
*/
public class WordsToSentencesAnnotator implements Annotator {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(WordsToSentencesAnnotator.class);

  /** The underlying splitter that groups the token list into sentences. */
  private final WordToSentenceProcessor<CoreLabel> wts;

  /** When true, a progress message is logged each time {@link #annotate} runs. */
  private final boolean VERBOSE;

  /**
   * When true, every list returned by the splitter (including empty ones) counts as a line,
   * and each non-empty sentence is tagged with its line number.
   */
  private final boolean countLineNumbers;

  /** Creates a non-verbose annotator backed by a default {@link WordToSentenceProcessor}. */
  public WordsToSentencesAnnotator() {
    this(false);
  }

  /**
   * Creates an annotator configured from properties. Three mutually exclusive modes are selected,
   * in this order:
   * <ol>
   * <li>If {@code StanfordCoreNLP.NEWLINE_SPLITTER_PROPERTY} is true, split on newline tokens only
   *     and count line numbers. The newline token(s) depend on {@code tokenize.whitespace}.</li>
   * <li>Otherwise, if {@code ssplit.isOneSentence} is true, never split.</li>
   * <li>Otherwise, build a splitter from {@code ssplit.boundaryTokenRegex},
   *     {@code ssplit.boundariesToDiscard}, {@code ssplit.htmlBoundariesToDiscard},
   *     {@code StanfordCoreNLP.NEWLINE_IS_SENTENCE_BREAK_PROPERTY},
   *     {@code ssplit.boundaryMultiTokenRegex} and {@code ssplit.tokenPatternsToDiscard}.</li>
   * </ol>
   * The resulting annotator is never verbose.
   *
   * @param properties The properties to read the configuration from
   */
  public WordsToSentencesAnnotator(Properties properties) {
    // log.info(signature());
    // todo: The above shows that signature is edu.stanford.nlp.pipeline.AnnotatorImplementations: and doesn't reflect what annotator it is! Should fix. Maybe is fixed now [2016]. Test!
    final WordToSentenceProcessor<CoreLabel> processor;
    final boolean countLines;

    boolean nlSplitting = Boolean.parseBoolean(
            properties.getProperty(StanfordCoreNLP.NEWLINE_SPLITTER_PROPERTY, "false"));
    if (nlSplitting) {
      boolean whitespaceTokenization = Boolean.parseBoolean(
              properties.getProperty("tokenize.whitespace", "false"));
      String[] newlineTokens;
      if (whitespaceTokenization) {
        if (System.lineSeparator().equals("\n")) {
          newlineTokens = new String[]{"\n"};
        } else {
          // throw "\n" in just in case files use that instead of
          // the system separator
          newlineTokens = new String[]{System.lineSeparator(), "\n"};
        }
      } else {
        newlineTokens = new String[]{PTBTokenizer.getNewlineToken()};
      }
      // this constructor will keep empty lines as empty sentences
      processor = new WordToSentenceProcessor<>(ArrayUtils.asImmutableSet(newlineTokens));
      countLines = true;
    } else if (Boolean.parseBoolean(properties.getProperty("ssplit.isOneSentence"))) { // parseBoolean treats null as false
      // Treat as one sentence: You get a no-op sentence splitter that always returns all tokens as one sentence.
      processor = new WordToSentenceProcessor<>(true);
      countLines = false;
    } else {
      // multi token sentence boundaries
      String boundaryMultiTokenRegex = properties.getProperty("ssplit.boundaryMultiTokenRegex");
      // Discard these tokens without marking them as sentence boundaries
      Set<String> tokenRegexesToDiscard = splitToSet(properties.getProperty("ssplit.tokenPatternsToDiscard"));
      // regular boundaries
      String boundaryTokenRegex = properties.getProperty("ssplit.boundaryTokenRegex");
      // todo [cdm 2016]: Add support for specifying ssplit.boundaryFollowerRegex here and send down to WordsToSentencesAnnotator
      // newline boundaries which are discarded.
      Set<String> boundariesToDiscard = splitToSet(properties.getProperty("ssplit.boundariesToDiscard"));
      // HTML boundaries which are discarded
      Set<String> htmlElementsToDiscard = splitToSet(properties.getProperty("ssplit.htmlBoundariesToDiscard"));
      String nlsb = properties.getProperty(StanfordCoreNLP.NEWLINE_IS_SENTENCE_BREAK_PROPERTY,
              StanfordCoreNLP.DEFAULT_NEWLINE_IS_SENTENCE_BREAK);
      processor = new WordToSentenceProcessor<>(boundaryTokenRegex, null,
              boundariesToDiscard, htmlElementsToDiscard,
              WordToSentenceProcessor.stringToNewlineIsSentenceBreak(nlsb),
              (boundaryMultiTokenRegex != null) ? TokenSequencePattern.compile(boundaryMultiTokenRegex) : null,
              tokenRegexesToDiscard);
      countLines = false;
    }

    // Each final field is assigned exactly once, whichever mode was chosen above.
    VERBOSE = false;
    this.countLineNumbers = countLines;
    this.wts = processor;
  }

  /**
   * Creates an annotator backed by a default {@link WordToSentenceProcessor} that does not
   * count line numbers.
   *
   * @param verbose Whether to log a message each time {@link #annotate} runs
   */
  public WordsToSentencesAnnotator(boolean verbose) {
    this(verbose, false, new WordToSentenceProcessor<>());
  }

  /**
   * Creates an annotator from explicit splitter settings. Line numbers are not counted.
   *
   * @param verbose Whether to log a message each time {@link #annotate} runs
   * @param boundaryTokenRegex Passed through to the {@link WordToSentenceProcessor}
   * @param boundaryToDiscard Passed through to the {@link WordToSentenceProcessor}
   * @param htmlElementsToDiscard Passed through to the {@link WordToSentenceProcessor}
   * @param newlineIsSentenceBreak Converted with
   *        {@code WordToSentenceProcessor.stringToNewlineIsSentenceBreak}
   * @param boundaryMultiTokenRegex If non-null, compiled into a {@link TokenSequencePattern};
   *        if null, no multi-token pattern is passed
   * @param tokenRegexesToDiscard Passed through to the {@link WordToSentenceProcessor}
   */
  public WordsToSentencesAnnotator(boolean verbose, String boundaryTokenRegex,
                                   Set<String> boundaryToDiscard, Set<String> htmlElementsToDiscard,
                                   String newlineIsSentenceBreak, String boundaryMultiTokenRegex,
                                   Set<String> tokenRegexesToDiscard) {
    this(verbose, false,
            new WordToSentenceProcessor<>(boundaryTokenRegex, null,
                    boundaryToDiscard, htmlElementsToDiscard,
                    WordToSentenceProcessor.stringToNewlineIsSentenceBreak(newlineIsSentenceBreak),
                    (boundaryMultiTokenRegex != null) ? TokenSequencePattern.compile(boundaryMultiTokenRegex) : null,
                    tokenRegexesToDiscard));
  }

  /** Base constructor: stores the three configuration values verbatim. */
  private WordsToSentencesAnnotator(boolean verbose, boolean countLineNumbers,
                                    WordToSentenceProcessor<CoreLabel> wts) {
    VERBOSE = verbose;
    this.countLineNumbers = countLineNumbers;
    this.wts = wts;
  }

  /**
   * Splits a comma-separated property value into a set of strings.
   *
   * @param commaSeparated The raw property value, possibly null
   * @return The set of comma-delimited pieces, or null if the input was null
   */
  private static Set<String> splitToSet(String commaSeparated) {
    if (commaSeparated == null) {
      return null;
    }
    return Generics.newHashSet(Arrays.asList(commaSeparated.split(",")));
  }

  /** Return a WordsToSentencesAnnotator that splits on newlines (only), which are then deleted.
   *  This constructor counts the lines by putting in empty token lists for empty lines.
   *  It tells the underlying splitter to return empty lists of tokens
   *  and then treats those empty lists as empty lines.  We don't
   *  actually include empty sentences in the annotation, though. But they
   *  are used in numbering the sentence. Only this constructor leads to
   *  empty sentences.
   *
   *  @param  nlToken Zero or more new line tokens, which might be a {@literal \n} or the fake
   *                 newline tokens returned from the tokenizer.
   *  @return A WordsToSentenceAnnotator.
   */
  public static WordsToSentencesAnnotator newlineSplitter(String... nlToken) {
    // this constructor will keep empty lines as empty sentences
    WordToSentenceProcessor<CoreLabel> wts =
            new WordToSentenceProcessor<>(ArrayUtils.asImmutableSet(nlToken));
    return new WordsToSentencesAnnotator(false, true, wts);
  }

  /** Return a WordsToSentencesAnnotator that never splits the token stream. You just get one sentence.
   *
   *  @return A WordsToSentenceAnnotator.
   */
  public static WordsToSentencesAnnotator nonSplitter() {
    WordToSentenceProcessor<CoreLabel> wts = new WordToSentenceProcessor<>(true);
    return new WordsToSentencesAnnotator(false, false, wts);
  }

  /**
   * Splits the document's tokens into sentences and stores them under
   * {@code SentencesAnnotation}. Each sentence receives its text, character and token offsets,
   * sentence index, and (if present on the document) the doc ID; each token receives its
   * 1-based index within the sentence, its sentence index, and the doc ID.
   * <p>
   * If {@code countLineNumbers} is true, we count line numbers by
   * telling the underlying splitter to return empty lists of tokens
   * and then treating those empty lists as empty lines.  We don't
   * actually include empty sentences in the annotation, though.
   * <p>
   * Section information is optional: if the document has no {@code SectionsAnnotation},
   * sentences are simply not assigned to a section.
   *
   * @param annotation The document to annotate; must contain {@code TokensAnnotation}
   * @throws IllegalArgumentException If the document has no {@code TokensAnnotation}
   * @throws IllegalStateException If the splitter returns an empty sentence while
   *         line counting is off
   */
  @Override
  public void annotate(Annotation annotation) {
    if (VERBOSE) {
      log.info("Sentence splitting ...");
    }
    if ( ! annotation.containsKey(CoreAnnotations.TokensAnnotation.class)) {
      throw new IllegalArgumentException("WordsToSentencesAnnotator: unable to find words/tokens in: " + annotation);
    }

    // get text and tokens from the document
    String text = annotation.get(CoreAnnotations.TextAnnotation.class);
    List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
    String docID = annotation.get(CoreAnnotations.DocIDAnnotation.class);
    // log.info("Tokens are: " + tokens);

    // assemble the sentence annotations
    int tokenOffset = 0;
    int lineNumber = 0;
    // section annotations to mark sentences with
    CoreMap sectionAnnotations = null;
    List<CoreMap> sentences = new ArrayList<>();
    // keep track of current section to assign sentences to sections
    int currSectionIndex = 0;
    // null when the document carries no section information
    List<CoreMap> sections = annotation.get(CoreAnnotations.SectionsAnnotation.class);

    for (List<CoreLabel> sentenceTokens : wts.process(tokens)) {
      if (countLineNumbers) {
        ++lineNumber;
      }
      if (sentenceTokens.isEmpty()) {
        if ( ! countLineNumbers) {
          throw new IllegalStateException("unexpected empty sentence: " + sentenceTokens);
        } else {
          // an empty line: counted above, but not emitted as a sentence
          continue;
        }
      }

      // get the sentence text from the first and last character offsets
      CoreLabel sentenceStartToken = sentenceTokens.get(0);
      CoreLabel sentenceEndToken = sentenceTokens.get(sentenceTokens.size() - 1);
      int begin = sentenceStartToken.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
      int end = sentenceEndToken.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
      String sentenceText = text.substring(begin, end);

      // create a sentence annotation with text and token offsets
      Annotation sentence = new Annotation(sentenceText);
      sentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, begin);
      sentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, end);
      sentence.set(CoreAnnotations.TokensAnnotation.class, sentenceTokens);
      sentence.set(CoreAnnotations.TokenBeginAnnotation.class, tokenOffset);
      tokenOffset += sentenceTokens.size();
      sentence.set(CoreAnnotations.TokenEndAnnotation.class, tokenOffset);
      sentence.set(CoreAnnotations.SentenceIndexAnnotation.class, sentences.size());

      if (countLineNumbers) {
        sentence.set(CoreAnnotations.LineNumberAnnotation.class, lineNumber);
      }

      // Annotate sentence with section information.
      // Assume section start and end appear as first and last tokens of sentence
      CoreMap sectionStart = sentenceStartToken.get(CoreAnnotations.SectionStartAnnotation.class);
      if (sectionStart != null) {
        // Section is started
        sectionAnnotations = sectionStart;
      }
      if (sectionAnnotations != null) {
        // transfer annotations over to sentence
        ChunkAnnotationUtils.copyUnsetAnnotations(sectionAnnotations, sentence);
      }
      String sectionEnd = sentenceEndToken.get(CoreAnnotations.SectionEndAnnotation.class);
      if (sectionEnd != null) {
        sectionAnnotations = null;
      }

      // determine section index for this sentence if keeping track of sections
      if (sections != null) {
        // try to find a section that ends after this sentence ends, check if it encloses sentence
        // if it doesn't, that means this sentence is in two sections
        while (currSectionIndex < sections.size()) {
          CoreMap currSection = sections.get(currSectionIndex);
          int currSectionCharBegin = currSection.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
          int currSectionCharEnd = currSection.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
          if (currSectionCharEnd < end) {
            // this section ends before the sentence does; try the next one
            currSectionIndex++;
            continue;
          }
          // check if this sentence fits in the candidate section
          if (currSectionCharBegin <= begin) {
            // add sentence to section list
            currSection.get(CoreAnnotations.SentencesAnnotation.class).add(sentence);
            // set sentence's section date
            String sectionDate = currSection.get(CoreAnnotations.SectionDateAnnotation.class);
            sentence.set(CoreAnnotations.SectionDateAnnotation.class, sectionDate);
            // set section index
            sentence.set(CoreAnnotations.SectionIndexAnnotation.class, currSectionIndex);
          }
          break;
        }
      }

      if (docID != null) {
        sentence.set(CoreAnnotations.DocIDAnnotation.class, docID);
      }

      // token indices are 1-based within the sentence
      int index = 1;
      for (CoreLabel token : sentenceTokens) {
        token.setIndex(index++);
        token.setSentIndex(sentences.size());
        if (docID != null) {
          token.setDocID(docID);
        }
      }

      // add the sentence to the list
      sentences.add(sentence);
    }

    // Note: tokenOffset != tokens.size() is possible here if sentenceBoundaryToDiscard is
    // initialized, so no consistency check between the two is performed.

    // add the sentences annotations to the document
    annotation.set(CoreAnnotations.SentencesAnnotation.class, sentences);
  }

  /**
   * @return An unmodifiable set of the annotations this annotator declares as prerequisites:
   *         text, tokens, and character begin/end offsets
   */
  @Override
  public Set<Class<? extends CoreAnnotation>> requires() {
    return Collections.unmodifiableSet(new ArraySet<>(Arrays.asList(
            CoreAnnotations.TextAnnotation.class,
            CoreAnnotations.TokensAnnotation.class,
            CoreAnnotations.CharacterOffsetBeginAnnotation.class,
            CoreAnnotations.CharacterOffsetEndAnnotation.class
    )));
  }

  /**
   * @return The set of annotations this annotator declares as provided:
   *         sentences and sentence indices
   */
  @Override
  public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
    return new HashSet<>(Arrays.asList(
            CoreAnnotations.SentencesAnnotation.class,
            CoreAnnotations.SentenceIndexAnnotation.class
    ));
  }

}