package edu.stanford.nlp.pipeline;
import edu.stanford.nlp.util.logging.Redwood;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.parser.common.ParserUtils;
import edu.stanford.nlp.parser.charniak.CharniakParser;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.util.ArraySet;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;
/**
* This class will add parse information to an Annotation from the BLLIP parser.
* It allows you to use the Charniak parser or Charniak and Johnson reranking parser
* along with any existing parser and reranking model.
*
* It assumes that the Annotation already contains the tokenized words
* as a {@code List<List<CoreLabel>>} under
* {@code CoreAnnotations.SentencesAnnotation.class}.
* If the words have POS tags, they will not be used.
*
* @author David McClosky
*/
public class CharniakParserAnnotator implements Annotator {
/** A logger for this class */
private static Redwood.RedwoodChannels log = Redwood.channels(CharniakParserAnnotator.class);
// TODO: make this an option?
private static final boolean BUILD_GRAPHS = true;
private final GrammaticalStructureFactory gsf = new EnglishGrammaticalStructureFactory();
private final boolean VERBOSE;
private final CharniakParser parser;
public CharniakParserAnnotator(String parserModel, String parserExecutable, boolean verbose, int maxSentenceLength) {
VERBOSE = verbose;
parser = new CharniakParser(parserExecutable, parserModel);
parser.setMaxSentenceLength(maxSentenceLength);
}
public CharniakParserAnnotator() {
VERBOSE = false;
parser = new CharniakParser();
}
@Override
public void annotate(Annotation annotation) {
if (annotation.containsKey(CoreAnnotations.SentencesAnnotation.class)) {
// parse a tree for each sentence
for (CoreMap sentence: annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
List<CoreLabel> words = sentence.get(CoreAnnotations.TokensAnnotation.class);
if (VERBOSE) {
log.info("Parsing: " + words);
}
int maxSentenceLength = parser.getMaxSentenceLength();
// generate the constituent tree
Tree tree; // initialized below
if (maxSentenceLength <= 0 || words.size() < maxSentenceLength) {
tree = parser.getBestParse(words);
}
else {
tree = ParserUtils.xTree(words);
}
List<Tree> trees = Generics.newArrayList(1);
trees.add(tree);
ParserAnnotatorUtils.fillInParseAnnotations(VERBOSE, BUILD_GRAPHS, gsf, sentence, trees, GrammaticalStructure.Extras.NONE);
}
} else {
throw new RuntimeException("unable to find sentences in: " + annotation);
}
}
@Override
public Set<Class<? extends CoreAnnotation>> requires() {
return Collections.unmodifiableSet(new ArraySet<>(Arrays.asList(
CoreAnnotations.TextAnnotation.class,
CoreAnnotations.TokensAnnotation.class,
CoreAnnotations.CharacterOffsetBeginAnnotation.class,
CoreAnnotations.CharacterOffsetEndAnnotation.class,
CoreAnnotations.SentencesAnnotation.class
)));
}
@Override
public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
return Collections.unmodifiableSet(new ArraySet<>(Arrays.asList(
CoreAnnotations.PartOfSpeechAnnotation.class,
TreeCoreAnnotations.TreeAnnotation.class,
CoreAnnotations.CategoryAnnotation.class
)));
}
}