package edu.stanford.nlp.paragraphs;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.Annotator;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.logging.Redwood;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Annotator that tags every sentence of a document with the index of the
 * paragraph it starts in ({@code CoreAnnotations.ParagraphIndexAnnotation}).
 *
 * <p>Paragraph boundaries are runs of newline characters in the document text.
 * The {@code paragraphBreak} property selects how many newlines are needed:
 * {@code "two"} (the default) requires at least two consecutive newlines,
 * {@code "one"} treats any run of one or more newlines as a boundary.
 *
 * <p>Indices start at 0 for the first sentence and grow by one each time at
 * least one paragraph break lies between the previous sentence start and the
 * current one. Breaks before the first sentence do not shift the numbering.
 *
 * @author Grace Muzny
 */
public class ParagraphAnnotator implements Annotator {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(ParagraphAnnotator.class);

  /** Paragraph boundary for the "two" setting: two or more consecutive newlines. */
  private static final Pattern TWO_NEWLINE_BREAK = Pattern.compile("\\n\\n+");

  /** Paragraph boundary for the "one" setting: one or more consecutive newlines. */
  private static final Pattern ONE_NEWLINE_BREAK = Pattern.compile("\\n+");

  private final boolean VERBOSE;

  // How many consecutive newlines separate two paragraphs
  // ["one" | "two"]
  public String PARAGRAPH_BREAK = "two";

  /**
   * Creates the annotator.
   *
   * @param props   configuration; the {@code paragraphBreak} property ("one" or
   *                "two", default "two") selects the paragraph boundary
   * @param verbose whether to report progress on standard error
   */
  public ParagraphAnnotator(Properties props, boolean verbose) {
    PARAGRAPH_BREAK = props.getProperty("paragraphBreak", "two");
    VERBOSE = verbose;
  }

  /**
   * Sets {@code ParagraphIndexAnnotation} on every sentence of the annotation.
   * Sentences are processed in list order; the numbering assumes that order
   * matches increasing character offsets.
   *
   * @param annotation document carrying the text and sentences annotations
   * @throws IllegalArgumentException if {@link #PARAGRAPH_BREAK} is neither
   *         "one" nor "two", or if the text or sentences annotation is missing
   */
  @Override
  public void annotate(Annotation annotation) {
    if (VERBOSE) {
      System.err.print("Adding paragraph index annotation (" + PARAGRAPH_BREAK + ") ...");
    }

    Pattern paragraphSplit = paragraphSplitPattern();

    String fullText = annotation.get(CoreAnnotations.TextAnnotation.class);
    if (fullText == null) {
      throw new IllegalArgumentException("ParagraphAnnotator requires a TextAnnotation");
    }
    List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
    if (sentences == null) {
      throw new IllegalArgumentException("ParagraphAnnotator requires a SentencesAnnotation");
    }

    List<Integer> paragraphBreaks = findBreakStarts(paragraphSplit, fullText);

    // each sentence gets a paragraph id annotation
    int currParagraph = 0;
    int nextBreak = 0;          // index of the first break not yet passed
    boolean seenSentence = false;
    for (CoreMap sent : sentences) {
      int sentBegin = sent.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);

      // Consume EVERY break that starts at or before this sentence. Advancing
      // by only one break per sentence would leave the counter lagging and
      // split later sentences of the same paragraph across several indices.
      boolean crossedBreak = false;
      while (nextBreak < paragraphBreaks.size() && sentBegin >= paragraphBreaks.get(nextBreak)) {
        nextBreak++;
        crossedBreak = true;
      }

      // Several breaks in a row still open just one new paragraph, and breaks
      // ahead of the first sentence do not open one at all.
      if (crossedBreak && seenSentence) {
        currParagraph++;
      }
      seenSentence = true;

      sent.set(CoreAnnotations.ParagraphIndexAnnotation.class, currParagraph);
    }

    if (VERBOSE) {
      System.err.println("done");
    }
  }

  /**
   * Resolves the current {@link #PARAGRAPH_BREAK} setting to its pattern.
   * Looked up on every call because the field is public and mutable.
   */
  private Pattern paragraphSplitPattern() {
    if ("two".equals(PARAGRAPH_BREAK)) {
      return TWO_NEWLINE_BREAK;
    } else if ("one".equals(PARAGRAPH_BREAK)) {
      return ONE_NEWLINE_BREAK;
    }
    throw new IllegalArgumentException(
        "Unknown paragraphBreak value \"" + PARAGRAPH_BREAK + "\"; expected \"one\" or \"two\"");
  }

  /** Returns the start offset of every match of {@code split} in {@code text}, in ascending order. */
  private static List<Integer> findBreakStarts(Pattern split, String text) {
    List<Integer> starts = new ArrayList<>();
    Matcher m = split.matcher(text);
    while (m.find()) {
      starts.add(m.start());
    }
    return starts;
  }

  /** {@inheritDoc} This annotator provides only the paragraph index annotation. */
  @Override
  public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
    return Collections.singleton(CoreAnnotations.ParagraphIndexAnnotation.class);
  }

  /** {@inheritDoc} */
  @Override
  public Set<Class<? extends CoreAnnotation>> requires() {
    return new HashSet<>(Arrays.asList(
        CoreAnnotations.TextAnnotation.class,
        CoreAnnotations.TokensAnnotation.class,
        CoreAnnotations.SentencesAnnotation.class,
        CoreAnnotations.CharacterOffsetBeginAnnotation.class,
        CoreAnnotations.CharacterOffsetEndAnnotation.class,
        CoreAnnotations.BeforeAnnotation.class,
        CoreAnnotations.AfterAnnotation.class,
        CoreAnnotations.TokenBeginAnnotation.class,
        CoreAnnotations.TokenEndAnnotation.class,
        CoreAnnotations.IndexAnnotation.class,
        CoreAnnotations.OriginalTextAnnotation.class
    ));
  }
}