//
// StanfordCoreNLP -- a suite of NLP tools.
// Copyright (c) 2009-2011 The Board of Trustees of
// The Leland Stanford Junior University. All Rights Reserved.
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
//
// For more information, bug reports, fixes, contact:
// Christopher Manning
// Dept of Computer Science, Gates 1A
// Stanford CA 94305-9010
// USA
//
package edu.stanford.nlp.pipeline;
import hu.u_szeged.nlp.pos.MagyarlancResourceHolder;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import edu.stanford.nlp.ie.NERClassifierCombiner;
import edu.stanford.nlp.ie.regexp.NumberSequenceClassifier;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.ReflectionLoading;
/**
* This is a pipeline that takes in a string and returns various analyzed linguistic forms. The String is
* tokenized via a tokenizer (such as PTBTokenizerAnnotator), and then other sequence model style annotation
* can be used to add things like lemmas, POS tags, and named entities. These are returned as a list of
* CoreLabels. Other analysis components build and store parse trees, dependency graphs, etc.
* <p>
* This class is designed to apply multiple Annotators to an Annotation. The idea is that you first build up
* the pipeline by adding Annotators, and then you take the objects you wish to annotate and pass them in and
* get in return a fully annotated object. At the command-line level you can, e.g., tokenize text with
* StanfordCoreNLP with a command like: <br/>
*
* <pre>
* java edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit -file document.txt
* </pre>
*
* <br/>
* Please see the package level javadoc for sample usage and a more complete description.
* <p>
 * The main entry point for the API is SzTECoreNLP.process() .
* <p>
* <i>Implementation note:</i> There are other annotation pipelines, but they don't extend this one. Look for
* classes that implement Annotator and which have "Pipeline" in their name.
*
* @author Jenny Finkel
* @author Anna Rafferty
* @author Christopher Manning
* @author Mihai Surdeanu
* @author Steven Bethard
*/
public class SzTECoreNLP extends SzTEAnnotationPipeline {
// EXTENSION
public static String lang = null;
/*
* List of all known annotator property names Add new annotators and/or annotators from other groups here!
*/
private static final String NORMALIZATION = "normalize";
private static final String STOPWORD_CHECK = "stopword";
private static final String MWE = "mwe";
// --> EXTENSION
// other constants
public static final String CUSTOM_ANNOTATOR_PREFIX = "customAnnotatorClass.";
private static final String PROPS_SUFFIX = ".properties";
public static final String NEWLINE_SPLITTER_PROPERTY = "ssplit.eolonly";
public static final String NEWLINE_IS_SENTENCE_BREAK_PROPERTY = "ssplit.newlineIsSentenceBreak";
public static final String DEFAULT_NEWLINE_IS_SENTENCE_BREAK = "two";
/** Stores the overall number of words processed */
private int numWords;
/** Maintains the shared pool of annotators */
public static AnnotatorPool pool = null;
private Properties properties;
/**
* Constructs a pipeline using as properties the properties file found in the classpath
*/
public SzTECoreNLP() {
this((Properties) null);
}
/**
* Construct a basic pipeline. The Properties will be used to determine which annotators to create, and a
* default AnnotatorPool will be used to create the annotators.
*
*/
public SzTECoreNLP(Properties props) {
this(props, (props == null || PropertiesUtils.getBool(props, "enforceRequirements", true)));
}
public SzTECoreNLP(Properties props, boolean enforceRequirements) {
construct(props, enforceRequirements);
}
/**
* Constructs a pipeline with the properties read from this file, which must be found in the classpath
*
* @param propsFileNamePrefix
*/
public SzTECoreNLP(String propsFileNamePrefix) {
this(propsFileNamePrefix, true);
}
public SzTECoreNLP(String propsFileNamePrefix, boolean enforceRequirements) {
Properties props = loadProperties(propsFileNamePrefix);
if (props == null) {
throw new RuntimeIOException("ERROR: cannot find properties file \"" + propsFileNamePrefix
+ "\" in the classpath!");
}
construct(props, enforceRequirements);
}
//
// property-specific methods
//
private static String getRequiredProperty(Properties props, String name) {
String val = props.getProperty(name);
if (val == null) {
System.err.println("Missing property \"" + name + "\"!");
printRequiredProperties(System.err);
throw new RuntimeException("Missing property: \"" + name + '\"');
}
return val;
}
/**
* Finds the properties file in the classpath and loads the properties from there.
*
* @return The found properties object (must be not-null)
* @throws RuntimeException
* If no properties file can be found on the classpath
*/
private static Properties loadPropertiesFromClasspath() {
List<String> validNames = Arrays.asList("StanfordCoreNLP", "edu.stanford.nlp.pipeline.StanfordCoreNLP");
for (String name : validNames) {
Properties props = loadProperties(name);
if (props != null)
return props;
}
throw new RuntimeException("ERROR: Could not find properties file in the classpath!");
}
private static Properties loadProperties(String name) {
return loadProperties(name, Thread.currentThread().getContextClassLoader());
}
private static Properties loadProperties(String name, ClassLoader loader) {
if (name.endsWith(PROPS_SUFFIX))
name = name.substring(0, name.length() - PROPS_SUFFIX.length());
name = name.replace('.', '/');
name += PROPS_SUFFIX;
Properties result = null;
// Returns null on lookup failures
System.err.println("Searching for resource: " + name);
InputStream in = loader.getResourceAsStream(name);
try {
if (in != null) {
InputStreamReader reader = new InputStreamReader(in, "utf-8");
result = new Properties();
result.load(reader); // Can throw IOException
}
} catch (IOException e) {
result = null;
} finally {
IOUtils.closeIgnoringExceptions(in);
}
return result;
}
/** Fetches the Properties object used to construct this Annotator */
public Properties getProperties() {
return properties;
}
public String getEncoding() {
return properties.getProperty("encoding", "UTF-8");
}
//
// AnnotatorPool construction support
//
private void construct(Properties props, boolean enforceRequirements) {
this.numWords = 0;
if (props == null) {
// if undefined, find the properties file in the classpath
props = loadPropertiesFromClasspath();
} else if (props.getProperty("annotators") == null) {
// this happens when some command line options are specified (e.g just "-filelist") but no properties
// file is.
// we use the options that are given and let them override the default properties from the class path
// properties.
Properties fromClassPath = loadPropertiesFromClasspath();
fromClassPath.putAll(props);
props = fromClassPath;
}
// EXTENSION
lang = props.getProperty("lang", "en");
if (!props.containsKey("pos.model")) {
if (lang.equals("hu")) {
MagyarlancResourceHolder.initCorpus(System.getProperty("user.dir") + "/resources/magyarlanc/szeged_2_3.lex");
MagyarlancResourceHolder.initFrequencies(System.getProperty("user.dir")
+ "/resources/magyarlanc/szeged_2_3.freq");
MagyarlancResourceHolder.initRFSA(System.getProperty("user.dir") + "/resources/magyarlanc/rfsa.txt");
MagyarlancResourceHolder.initCorrDic(System.getProperty("user.dir") + "/resources/magyarlanc/corrdic.txt");
MagyarlancResourceHolder.initMorPhonDir();
MagyarlancResourceHolder.initMSDReducer();
MagyarlancResourceHolder.initKRToMSD();
props.put("pos.model", System.getProperty("user.dir") + "/resources/magyarlanc/szeged_2_3.model");
} else {
props.put("pos.model", DefaultPaths.DEFAULT_POS_MODEL);
}
}
// -->EXTENSION
this.properties = props;
AnnotatorPool pool = getDefaultAnnotatorPool(props);
// now construct the annotators from the given properties in the given order
List<String> annoNames = Arrays.asList(getRequiredProperty(props, "annotators").split("[, \t]+"));
Set<String> alreadyAddedAnnoNames = new HashSet<String>();
Set<Requirement> requirementsSatisfied = new HashSet<Requirement>();
for (String name : annoNames) {
name = name.trim();
if (name.isEmpty()) {
continue;
}
System.err.println("Adding annotator " + name);
Annotator an = pool.get(name);
this.addAnnotator(an);
if (enforceRequirements) {
Set<Requirement> allRequirements = an.requires();
for (Requirement requirement : allRequirements) {
if (!requirementsSatisfied.contains(requirement)) {
String fmt = "annotator \"%s\" requires annotator \"%s\"";
throw new IllegalArgumentException(String.format(fmt, name, requirement));
}
}
requirementsSatisfied.addAll(an.requirementsSatisfied());
}
alreadyAddedAnnoNames.add(name);
}
// Sanity check
if (!alreadyAddedAnnoNames.contains(STANFORD_SSPLIT)) {
System.setProperty(NEWLINE_SPLITTER_PROPERTY, "false");
}
}
/**
* Call this if you are no longer using StanfordCoreNLP and want to release the memory associated with the
* annotators.
*/
public static synchronized void clearAnnotatorPool() {
pool = null;
}
private static synchronized AnnotatorPool getDefaultAnnotatorPool(final Properties inputProps) {
// if the pool already exists reuse!
if (pool == null) {
// first time we get here
pool = new AnnotatorPool();
}
//
// tokenizer: breaks text into a sequence of tokens
// this is required for all following annotators!
//
pool.register(STANFORD_TOKENIZE, new AnnotatorFactory(inputProps) {
private static final long serialVersionUID = 1L;
@Override
public Annotator create() {
if (Boolean.valueOf(properties.getProperty("tokenize.whitespace", "false"))) {
return new WhitespaceTokenizerAnnotator(properties);
} else {
String options = properties.getProperty("tokenize.options", PTBTokenizerAnnotator.DEFAULT_OPTIONS);
boolean keepNewline = Boolean.valueOf(properties.getProperty(NEWLINE_SPLITTER_PROPERTY, "false"));
// If they
if (properties.getProperty(NEWLINE_IS_SENTENCE_BREAK_PROPERTY) != null) {
keepNewline = true;
}
// If the user specifies "tokenizeNLs=false" in tokenize.options, then this default will
// be overridden.
if (keepNewline) {
options = "tokenizeNLs," + options;
}
// EXTENSION
if (lang.equals("hu")) {
return new HunTokenizerAnnotator(false, options);
} else {
return new PTBTokenizerAnnotator(false, options);
}
// return new PTBTokenizerAnnotator(false, options);
// -->EXTENSION
}
}
@Override
public String signature() {
// keep track of all relevant properties for this annotator here!
StringBuilder os = new StringBuilder();
os.append("tokenize.whitespace:" + properties.getProperty("tokenize.whitespace", "false"));
if (Boolean.valueOf(properties.getProperty("tokenize.whitespace", "false"))) {
os.append(WhitespaceTokenizerAnnotator.EOL_PROPERTY + ":"
+ properties.getProperty(WhitespaceTokenizerAnnotator.EOL_PROPERTY, "false"));
os.append(StanfordCoreNLP.NEWLINE_SPLITTER_PROPERTY + ":"
+ properties.getProperty(StanfordCoreNLP.NEWLINE_SPLITTER_PROPERTY, "false"));
return os.toString();
} else {
os.append(NEWLINE_SPLITTER_PROPERTY + ":"
+ Boolean.valueOf(properties.getProperty(NEWLINE_SPLITTER_PROPERTY, "false")));
os.append(NEWLINE_IS_SENTENCE_BREAK_PROPERTY + ":"
+ properties.getProperty(NEWLINE_IS_SENTENCE_BREAK_PROPERTY, DEFAULT_NEWLINE_IS_SENTENCE_BREAK));
}
return os.toString();
}
});
pool.register(STANFORD_CLEAN_XML, new AnnotatorFactory(inputProps) {
private static final long serialVersionUID = 1L;
@Override
public Annotator create() {
String xmlTags = properties.getProperty("clean.xmltags", CleanXmlAnnotator.DEFAULT_XML_TAGS);
String sentenceEndingTags = properties.getProperty("clean.sentenceendingtags",
CleanXmlAnnotator.DEFAULT_SENTENCE_ENDERS);
// String singleSentenceTags = properties.getProperty("clean.singlesentencetags",
// CleanXmlAnnotator.DEFAULT_SINGLE_SENTENCE_TAGS);
// String allowFlawedString = properties.getProperty("clean.allowflawedxml");
// boolean allowFlawed = CleanXmlAnnotator.DEFAULT_ALLOW_FLAWS;
// if (allowFlawedString != null)
// allowFlawed = Boolean.valueOf(allowFlawedString);
String dateTags = properties.getProperty("clean.datetags", CleanXmlAnnotator.DEFAULT_DATE_TAGS);
// String docIdTags = properties.getProperty("clean.docIdtags", CleanXmlAnnotator.DEFAULT_DOCID_TAGS);
// String docTypeTags = properties.getProperty("clean.docTypetags",
// CleanXmlAnnotator.DEFAULT_DOCTYPE_TAGS);
// String utteranceTurnTags = properties.getProperty("clean.turntags",
// CleanXmlAnnotator.DEFAULT_UTTERANCE_TURN_TAGS);
// String speakerTags = properties.getProperty("clean.speakertags",
// CleanXmlAnnotator.DEFAULT_SPEAKER_TAGS);
// String docAnnotations = properties.getProperty("clean.docAnnotations",
// CleanXmlAnnotator.DEFAULT_DOC_ANNOTATIONS_PATTERNS);
// String tokenAnnotations = properties.getProperty("clean.tokenAnnotations",
// CleanXmlAnnotator.DEFAULT_TOKEN_ANNOTATIONS_PATTERNS);
// String sectionTags = properties.getProperty("clean.sectiontags",
// CleanXmlAnnotator.DEFAULT_SECTION_TAGS);
// String sectionAnnotations = properties.getProperty("clean.sectionAnnotations",
// CleanXmlAnnotator.DEFAULT_SECTION_ANNOTATIONS_PATTERNS);
// String ssplitDiscardTokens = properties.getProperty("clean.ssplitDiscardTokens");
MyCleanXmlAnnotator annotator = new MyCleanXmlAnnotator(xmlTags, sentenceEndingTags, dateTags);
// annotator.setSingleSentenceTagMatcher(singleSentenceTags);
// annotator.setDocIdTagMatcher(docIdTags);
// annotator.setDocTypeTagMatcher(docTypeTags);
// annotator.setDiscourseTags(utteranceTurnTags, speakerTags);
// annotator.setDocAnnotationPatterns(docAnnotations);
// annotator.setTokenAnnotationPatterns(tokenAnnotations);
// annotator.setSectionTagMatcher(sectionTags);
// annotator.setSectionAnnotationPatterns(sectionAnnotations);
// annotator.setSsplitDiscardTokensMatcher(ssplitDiscardTokens);
return annotator;
}
@Override
public String signature() {
// keep track of all relevant properties for this annotator here!
return "clean.xmltags:"
+ properties.getProperty("clean.xmltags", CleanXmlAnnotator.DEFAULT_XML_TAGS)
+ "clean.sentenceendingtags:"
+ properties.getProperty("clean.sentenceendingtags", CleanXmlAnnotator.DEFAULT_SENTENCE_ENDERS)
+ "clean.sentenceendingtags:"
+ properties.getProperty("clean.singlesentencetags", CleanXmlAnnotator.DEFAULT_SINGLE_SENTENCE_TAGS)
+ "clean.allowflawedxml:"
+ properties.getProperty("clean.allowflawedxml", "")
+ "clean.datetags:"
+ properties.getProperty("clean.datetags", CleanXmlAnnotator.DEFAULT_DATE_TAGS)
+ "clean.docidtags:"
+ properties.getProperty("clean.docid", CleanXmlAnnotator.DEFAULT_DOCID_TAGS)
+ "clean.doctypetags:"
+ properties.getProperty("clean.doctype", CleanXmlAnnotator.DEFAULT_DOCTYPE_TAGS)
+ "clean.turntags:"
+ properties.getProperty("clean.turntags", CleanXmlAnnotator.DEFAULT_UTTERANCE_TURN_TAGS)
+ "clean.speakertags:"
+ properties.getProperty("clean.speakertags", CleanXmlAnnotator.DEFAULT_SPEAKER_TAGS)
+ "clean.docAnnotations:"
+ properties.getProperty("clean.docAnnotations", CleanXmlAnnotator.DEFAULT_DOC_ANNOTATIONS_PATTERNS)
+ "clean.tokenAnnotations:"
+ properties.getProperty("clean.tokenAnnotations", CleanXmlAnnotator.DEFAULT_TOKEN_ANNOTATIONS_PATTERNS)
+ "clean.sectiontags:"
+ properties.getProperty("clean.sectiontags", CleanXmlAnnotator.DEFAULT_SECTION_TAGS)
+ "clean.sectionAnnotations:"
+ properties
.getProperty("clean.sectionAnnotations", CleanXmlAnnotator.DEFAULT_SECTION_ANNOTATIONS_PATTERNS);
}
});
//
// Sentence splitter: splits the above sequence of tokens into
// sentences. This is required when processing entire documents or
// text consisting of multiple sentences.
//
pool.register(STANFORD_SSPLIT, new AnnotatorFactory(inputProps) {
private static final long serialVersionUID = 1L;
@Override
public Annotator create() {
boolean nlSplitting = Boolean.valueOf(properties.getProperty(NEWLINE_SPLITTER_PROPERTY, "false"));
if (nlSplitting) {
boolean whitespaceTokenization = Boolean.valueOf(properties.getProperty("tokenize.whitespace", "false"));
if (whitespaceTokenization) {
if (System.getProperty("line.separator").equals("\n")) {
return WordsToSentencesAnnotator.newlineSplitter(false, "\n");
} else {
// throw "\n" in just in case files use that instead of
// the system separator
return WordsToSentencesAnnotator.newlineSplitter(false, System.getProperty("line.separator"), "\n");
}
} else {
return WordsToSentencesAnnotator.newlineSplitter(false, PTBTokenizer.getNewlineToken());
}
} else {
// Treat as one sentence: You get a no-op sentence splitter that always returns all tokens as one
// sentence.
String isOneSentence = properties.getProperty("ssplit.isOneSentence");
if (Boolean.parseBoolean(isOneSentence)) { // this method treats null as false
return WordsToSentencesAnnotator.nonSplitter(false);
}
// multi token sentence boundaries
String boundaryMultiTokenRegex = properties.getProperty("ssplit.boundaryMultiTokenRegex");
// Discard these tokens without marking them as sentence boundaries
String tokenPatternsToDiscardProp = properties.getProperty("ssplit.tokenPatternsToDiscard");
Set<String> tokenRegexesToDiscard = null;
if (tokenPatternsToDiscardProp != null) {
String[] toks = tokenPatternsToDiscardProp.split(",");
tokenRegexesToDiscard = Generics.newHashSet(Arrays.asList(toks));
}
// regular boundaries
String boundaryTokenRegex = properties.getProperty("ssplit.boundaryTokenRegex");
Set<String> boundariesToDiscard = null;
// newline boundaries which are discarded.
String bounds = properties.getProperty("ssplit.boundariesToDiscard");
if (bounds != null) {
String[] toks = bounds.split(",");
boundariesToDiscard = Generics.newHashSet(Arrays.asList(toks));
}
Set<String> htmlElementsToDiscard = null;
// HTML boundaries which are discarded
bounds = properties.getProperty("ssplit.htmlBoundariesToDiscard");
if (bounds != null) {
String[] elements = bounds.split(",");
htmlElementsToDiscard = Generics.newHashSet(Arrays.asList(elements));
}
String nlsb = properties.getProperty(NEWLINE_IS_SENTENCE_BREAK_PROPERTY, DEFAULT_NEWLINE_IS_SENTENCE_BREAK);
return new WordsToSentencesAnnotator(false, boundaryTokenRegex, boundariesToDiscard, htmlElementsToDiscard,
nlsb, boundaryMultiTokenRegex, tokenRegexesToDiscard);
}
}
@Override
public String signature() {
// keep track of all relevant properties for this annotator here!
StringBuilder os = new StringBuilder();
os.append(NEWLINE_SPLITTER_PROPERTY + ":" + properties.getProperty(NEWLINE_SPLITTER_PROPERTY, "false"));
if (Boolean.valueOf(properties.getProperty(NEWLINE_SPLITTER_PROPERTY, "false"))) {
os.append("tokenize.whitespace:" + properties.getProperty("tokenize.whitespace", "false"));
} else {
os.append("ssplit.isOneSentence:" + properties.getProperty("ssplit.isOneSentence", "false"));
if (!Boolean.valueOf(properties.getProperty("ssplit.isOneSentence", "false"))) {
os.append("ssplit.boundaryTokenRegex:" + properties.getProperty("ssplit.boundaryTokenRegex", ""));
os.append("ssplit.boundariesToDiscard:" + properties.getProperty("ssplit.boundariesToDiscard", ""));
os.append("ssplit.htmlBoundariesToDiscard:" + properties.getProperty("ssplit.htmlBoundariesToDiscard", ""));
os.append(NEWLINE_IS_SENTENCE_BREAK_PROPERTY + ":"
+ properties.getProperty(NEWLINE_IS_SENTENCE_BREAK_PROPERTY, DEFAULT_NEWLINE_IS_SENTENCE_BREAK));
}
}
return os.toString();
}
});
//
// POS tagger
//
pool.register(STANFORD_POS, new AnnotatorFactory(inputProps) {
private static final long serialVersionUID = 1L;
@Override
public Annotator create() {
try {
// EXTENSION
return new OwnPOSTaggerAnnotator(inputProps.getProperty("pos.model", DefaultPaths.DEFAULT_POS_MODEL),
inputProps);
// return new POSTaggerAnnotator(inputProps.getProperty("pos.model",
// DefaultPaths.DEFAULT_POS_MODEL), inputProps);
// -->EXTENSION
} catch (Exception e) {
throw new RuntimeException(e);
}
}
@Override
public String signature() {
// keep track of all relevant properties for this annotator here!
return ("pos.maxlen:" + properties.getProperty("pos.maxlen", "") + "pos.model:"
+ properties.getProperty("pos.model", DefaultPaths.DEFAULT_POS_MODEL) + "pos.nthreads:" + properties
.getProperty("pos.nthreads", properties.getProperty("nthreads", "")));
}
});
//
// Lemmatizer
//
pool.register(STANFORD_LEMMA, new AnnotatorFactory(inputProps) {
private static final long serialVersionUID = 1L;
@Override
public Annotator create() {
// return new MorphaAnnotator(false);
return new OwnMorphaAnnotator(false, lang.equals("en"));
}
@Override
public String signature() {
// keep track of all relevant properties for this annotator here!
// nothing for this one
return "";
}
});
//
// NER
//
pool.register(STANFORD_NER, new AnnotatorFactory(inputProps) {
private static final long serialVersionUID = 1L;
@Override
public Annotator create() {
List<String> models = new ArrayList<String>();
String modelNames = properties.getProperty("ner.model");
if (modelNames == null) {
modelNames = DefaultPaths.DEFAULT_NER_THREECLASS_MODEL + "," + DefaultPaths.DEFAULT_NER_MUC_MODEL + ","
+ DefaultPaths.DEFAULT_NER_CONLL_MODEL;
}
if (modelNames.length() > 0) {
models.addAll(Arrays.asList(modelNames.split(",")));
}
if (models.isEmpty()) {
// Allow for no real NER model - can just use numeric classifiers or SUTime.
// Have to unset ner.model, so unlikely that people got here by accident.
System.err.println("WARNING: no NER models specified");
}
NERClassifierCombiner nerCombiner;
try {
boolean applyNumericClassifiers = PropertiesUtils.getBool(properties,
NERClassifierCombiner.APPLY_NUMERIC_CLASSIFIERS_PROPERTY,
NERClassifierCombiner.APPLY_NUMERIC_CLASSIFIERS_DEFAULT);
boolean useSUTime = PropertiesUtils.getBool(properties, NumberSequenceClassifier.USE_SUTIME_PROPERTY,
NumberSequenceClassifier.USE_SUTIME_DEFAULT);
nerCombiner = new NERClassifierCombiner(applyNumericClassifiers, useSUTime, properties, models
.toArray(new String[models.size()]));
} catch (FileNotFoundException e) {
throw new RuntimeIOException(e);
}
return new NERCombinerAnnotator(nerCombiner, false);
}
@Override
public String signature() {
// keep track of all relevant properties for this annotator here!
return "ner.model:"
+ properties.getProperty("ner.model", "")
+ "ner.model.3class:"
+ properties.getProperty("ner.model.3class", DefaultPaths.DEFAULT_NER_THREECLASS_MODEL)
+ "ner.model.7class:"
+ properties.getProperty("ner.model.7class", DefaultPaths.DEFAULT_NER_MUC_MODEL)
+ "ner.model.MISCclass:"
+ properties.getProperty("ner.model.MISCclass", DefaultPaths.DEFAULT_NER_CONLL_MODEL)
+ NERClassifierCombiner.APPLY_NUMERIC_CLASSIFIERS_PROPERTY
+ ":"
+ properties.getProperty(NERClassifierCombiner.APPLY_NUMERIC_CLASSIFIERS_PROPERTY,
Boolean.toString(NERClassifierCombiner.APPLY_NUMERIC_CLASSIFIERS_DEFAULT))
+ NumberSequenceClassifier.USE_SUTIME_PROPERTY
+ ":"
+ properties.getProperty(NumberSequenceClassifier.USE_SUTIME_PROPERTY,
Boolean.toString(NumberSequenceClassifier.USE_SUTIME_DEFAULT));
}
});
//
// Regex NER
//
pool.register(STANFORD_REGEXNER, new AnnotatorFactory(inputProps) {
private static final long serialVersionUID = 1L;
@Override
public Annotator create() {
return new TokensRegexNERAnnotator("regexner", properties);
}
@Override
public String signature() {
// keep track of all relevant properties for this annotator here!
return PropertiesUtils.getSignature("regexner", properties, TokensRegexNERAnnotator.SUPPORTED_PROPERTIES);
}
});
//
// Gender Annotator
//
pool.register(STANFORD_GENDER, new AnnotatorFactory(inputProps) {
private static final long serialVersionUID = 1L;
@Override
public Annotator create() {
return new GenderAnnotator(false, properties.getProperty("gender.firstnames",
DefaultPaths.DEFAULT_GENDER_FIRST_NAMES));
}
@Override
public String signature() {
// keep track of all relevant properties for this annotator here!
return "gender.firstnames:"
+ properties.getProperty("gender.firstnames", DefaultPaths.DEFAULT_GENDER_FIRST_NAMES);
}
});
//
// True caser
//
pool.register(STANFORD_TRUECASE, new AnnotatorFactory(inputProps) {
private static final long serialVersionUID = 1L;
@Override
public Annotator create() {
String model = properties.getProperty("truecase.model", DefaultPaths.DEFAULT_TRUECASE_MODEL);
String bias = properties.getProperty("truecase.bias", TrueCaseAnnotator.DEFAULT_MODEL_BIAS);
String mixed = properties.getProperty("truecase.mixedcasefile",
DefaultPaths.DEFAULT_TRUECASE_DISAMBIGUATION_LIST);
return new TrueCaseAnnotator(model, bias, mixed, false);
}
@Override
public String signature() {
// keep track of all relevant properties for this annotator here!
return "truecase.model:" + properties.getProperty("truecase.model", DefaultPaths.DEFAULT_TRUECASE_MODEL)
+ "truecase.bias:" + properties.getProperty("truecase.bias", TrueCaseAnnotator.DEFAULT_MODEL_BIAS)
+ "truecase.mixedcasefile:"
+ properties.getProperty("truecase.mixedcasefile", DefaultPaths.DEFAULT_TRUECASE_DISAMBIGUATION_LIST);
}
});
//
// Parser
//
pool.register(STANFORD_PARSE, new AnnotatorFactory(inputProps) {
private static final long serialVersionUID = 1L;
@Override
public Annotator create() {
String parserType = properties.getProperty("parse.type", "stanford");
String maxLenStr = properties.getProperty("parse.maxlen");
if (parserType.equalsIgnoreCase("stanford")) {
ParserAnnotator anno = new ParserAnnotator("parse", properties);
return anno;
} else if (parserType.equalsIgnoreCase("charniak")) {
String model = properties.getProperty("parse.model");
String parserExecutable = properties.getProperty("parse.executable");
if (model == null || parserExecutable == null) {
throw new RuntimeException(
"Both parse.model and parse.executable properties must be specified if parse.type=charniak");
}
int maxLen = 399;
if (maxLenStr != null) {
maxLen = Integer.parseInt(maxLenStr);
}
CharniakParserAnnotator anno = new CharniakParserAnnotator(model, parserExecutable, false, maxLen);
return anno;
} else {
throw new RuntimeException("Unknown parser type: " + parserType
+ " (currently supported: stanford and charniak)");
}
}
@Override
public String signature() {
// keep track of all relevant properties for this annotator here!
String type = properties.getProperty("parse.type", "stanford");
if (type.equalsIgnoreCase("stanford")) {
return ParserAnnotator.signature("parser", properties);
} else if (type.equalsIgnoreCase("charniak")) {
return "parse.model:" + properties.getProperty("parse.model", "") + "parse.executable:"
+ properties.getProperty("parse.executable", "") + "parse.maxlen:"
+ properties.getProperty("parse.maxlen", "");
} else {
throw new RuntimeException("Unknown parser type: " + type + " (currently supported: stanford and charniak)");
}
}
});
//
// Coreference resolution
//
pool.register(STANFORD_DETERMINISTIC_COREF, new AnnotatorFactory(inputProps) {
private static final long serialVersionUID = 1L;
@Override
public Annotator create() {
return new DeterministicCorefAnnotator(properties);
}
@Override
public String signature() {
// keep track of all relevant properties for this annotator here!
return DeterministicCorefAnnotator.signature(properties);
}
});
// add annotators loaded via reflection from classnames specified
// in the properties
for (Object propertyKey : inputProps.stringPropertyNames()) {
if (!(propertyKey instanceof String))
continue; // should this be an Exception?
String property = (String) propertyKey;
if (property.startsWith(CUSTOM_ANNOTATOR_PREFIX)) {
final String customName = property.substring(CUSTOM_ANNOTATOR_PREFIX.length());
final String customClassName = inputProps.getProperty(property);
System.err.println("Registering annotator " + customName + " with class " + customClassName);
pool.register(customName, new AnnotatorFactory(inputProps) {
private static final long serialVersionUID = 1L;
private final String name = customName;
private final String className = customClassName;
@Override
public Annotator create() {
return ReflectionLoading.loadByReflection(className, name, properties);
}
@Override
public String signature() {
// keep track of all relevant properties for this annotator here!
// since we don't know what props they need, let's copy all
// TODO: can we do better here? maybe signature() should be a method in the Annotator?
StringBuilder os = new StringBuilder();
for (Object key : properties.keySet()) {
String skey = (String) key;
os.append(skey + ":" + properties.getProperty(skey));
}
return os.toString();
}
});
}
}
pool.register(STANFORD_RELATION, new AnnotatorFactory(inputProps) {
private static final long serialVersionUID = 1L;
@Override
public Annotator create() {
return new RelationExtractorAnnotator(properties);
}
@Override
public String signature() {
// keep track of all relevant properties for this annotator here!
return "sup.relation.verbose:" + properties.getProperty("sup.relation.verbose", "false")
+ properties.getProperty("sup.relation.model", DefaultPaths.DEFAULT_SUP_RELATION_EX_RELATION_MODEL);
}
});
pool.register(STANFORD_SENTIMENT, new AnnotatorFactory(inputProps) {
private static final long serialVersionUID = 1L;
@Override
public Annotator create() {
return new SentimentAnnotator(STANFORD_SENTIMENT, properties);
}
@Override
public String signature() {
return "model=" + inputProps.get("model");
}
});
//
// Psudophrase generation
//
pool.register(NORMALIZATION, new AnnotatorFactory(inputProps) {
private static final long serialVersionUID = 1L;
public Annotator create() {
return new NormalizerAnnotator();
}
@Override
public String signature() {
// keep track of all relevant properties for this annotator here!
StringBuilder os = new StringBuilder();
// no used props for this one
return os.toString();
}
});
//
// Stopword checking
//
pool.register(STOPWORD_CHECK, new AnnotatorFactory(inputProps) {
private static final long serialVersionUID = 1L;
public Annotator create() {
return new StopWordAnnotator();
}
@Override
public String signature() {
// keep track of all relevant properties for this annotator here!
StringBuilder os = new StringBuilder();
// no used props for this one
return os.toString();
}
});
//
// MWE annotation
//
pool.register(MWE, new AnnotatorFactory(inputProps) {
private static final long serialVersionUID = 1L;
public Annotator create() {
return new MweDictAnnotator(false, inputProps.getProperty("mwe.file"));
}
@Override
public String signature() {
// keep track of all relevant properties for this annotator here!
StringBuilder os = new StringBuilder();
// no used props for this one
return os.toString();
}
});
// -->EXTENSION
return pool;
}
public static synchronized Annotator getExistingAnnotator(String name) {
if (pool == null) {
System.err.println("ERROR: attempted to fetch annotator \"" + name + "\" before the annotator pool was created!");
return null;
}
try {
Annotator a = pool.get(name);
return a;
} catch (IllegalArgumentException e) {
System.err.println("ERROR: attempted to fetch annotator \"" + name
+ "\" but the annotator pool does not store any such type!");
return null;
}
}
@Override
public void annotate(Annotation annotation) {
  // Run the full pipeline first, then account for the tokens it produced.
  super.annotate(annotation);
  final List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
  if (tokens != null) {
    // Running token count feeds the tokens/sec figure in timingInformation().
    numWords += tokens.size();
  }
}
/**
 * Prints the list of properties required to run the pipeline.
 *
 * @param os
 *          PrintStream to print usage to
 */
private static void printRequiredProperties(PrintStream os) {
  // TODO some annotators (ssplit, regexner, gender, some parser options, dcoref?) are not documented
  final String[] usage = {
      "The following properties can be defined:",
      "(if -props or -annotators is not passed in, default properties will be loaded via the classpath)",
      "\t\"props\" - path to file with configuration properties",
      "\t\"annotators\" - comma separated list of annotators",
      "\tThe following annotators are supported: cleanxml, tokenize, ssplit, pos, lemma, ner, truecase, parse, coref, dcoref, relation",
      "",
      "\tIf annotator \"tokenize\" is defined:",
      "\t\"tokenize.options\" - PTBTokenizer options (see edu.stanford.nlp.process.PTBTokenizer for details)",
      "\t\"tokenize.whitespace\" - If true, just use whitespace tokenization",
      "",
      "\tIf annotator \"cleanxml\" is defined:",
      "\t\"clean.xmltags\" - regex of tags to extract text from",
      "\t\"clean.sentenceendingtags\" - regex of tags which mark sentence endings",
      "\t\"clean.allowflawedxml\" - if set to true, don't complain about XML errors",
      "",
      "\tIf annotator \"pos\" is defined:",
      "\t\"pos.maxlen\" - maximum length of sentence to POS tag",
      "\t\"pos.model\" - path towards the POS tagger model",
      "",
      "\tIf annotator \"ner\" is defined:",
      "\t\"ner.model.3class\" - path towards the three-class NER model",
      "\t\"ner.model.7class\" - path towards the seven-class NER model",
      "\t\"ner.model.MISCclass\" - path towards the NER model with a MISC class",
      "",
      "\tIf annotator \"truecase\" is defined:",
      "\t\"truecase.model\" - path towards the true-casing model; default: " + DefaultPaths.DEFAULT_TRUECASE_MODEL,
      "\t\"truecase.bias\" - class bias of the true case model; default: " + TrueCaseAnnotator.DEFAULT_MODEL_BIAS,
      "\t\"truecase.mixedcasefile\" - path towards the mixed case file; default: " + DefaultPaths.DEFAULT_TRUECASE_DISAMBIGUATION_LIST,
      "",
      "\tIf annotator \"relation\" is defined:",
      "\t\"sup.relation.verbose\" - whether verbose or not",
      "\t\"sup.relation.model\" - path towards the relation extraction model",
      "",
      "\tIf annotator \"parse\" is defined:",
      "\t\"parse.model\" - path towards the PCFG parser model",
      /*
       * XXX: unstable, do not use for now — the srl help text is deliberately
       * left out of the usage message:
       * "\tIf annotator \"srl\" is defined:"
       * "\t\"srl.verb.args\" - path to the file listing verbs and their core arguments (\"verbs.core_args\")"
       * "\t\"srl.model.id\" - path prefix for the role identification model (adds \".model.gz\" and \".fe\" to this prefix)"
       * "\t\"srl.model.cls\" - path prefix for the role classification model (adds \".model.gz\" and \".fe\" to this prefix)"
       * "\t\"srl.model.jic\" - path to the directory containing the joint model's \"model.gz\", \"fe\" and \"je\" files"
       * "\t (if not specified, the joint model will not be used)"
       */
      "",
      "Command line properties:",
      "\t\"file\" - run the pipeline on the content of this file, or on the content of the files in this directory",
      "\t XML output is generated for every input file \"file\" as file.xml",
      "\t\"extension\" - if -file used with a directory, process only the files with this extension",
      "\t\"filelist\" - run the pipeline on the list of files given in this file",
      "\t output is generated for every input file as file.outputExtension",
      "\t\"outputDirectory\" - where to put output (defaults to the current directory)",
      "\t\"outputExtension\" - extension to use for the output file (defaults to \".xml\" for XML, \".ser.gz\" for serialized). Don't forget the dot!",
      "\t\"outputFormat\" - \"xml\" to output XML (default), \"serialized\" to output serialized Java objects, \"text\" to output text",
      "\t\"serializer\" - Class of annotation serializer to use when outputFormat is \"serialized\". By default, uses Java serialization.",
      "\t\"replaceExtension\" - flag to chop off the last extension before adding outputExtension to file",
      "\t\"noClobber\" - don't automatically override (clobber) output files that already exist",
      "\t\"threads\" - multithread on this number of threads",
      "",
      "If none of the above are present, run the pipeline in an interactive shell (default properties will be loaded from the classpath).",
      "The shell accepts input from stdin and displays the output at stdout.",
      "",
      "Run with -help [topic] for more help on a specific topic.",
      "Current topics include: parser",
      "",
  };
  // println("") emits exactly the same bytes as println(), so the blank
  // entries reproduce the original spacing.
  for (String line : usage) {
    os.println(line);
  }
}
/**
 * {@inheritDoc}
 */
@Override
public String timingInformation() {
  StringBuilder report = new StringBuilder(super.timingInformation());
  if (TIME && numWords >= 0) {
    // Convert total milliseconds to seconds and report throughput.
    double seconds = this.getTotalTime() / 1000.0;
    report.append(" for ").append(this.numWords).append(" tokens at ");
    report.append(String.format("%.1f", numWords / seconds));
    report.append(" tokens/sec.");
  }
  return report.toString();
}
}