// // StanfordCoreNLP -- a suite of NLP tools. // Copyright (c) 2009-2011 The Board of Trustees of // The Leland Stanford Junior University. All Rights Reserved. // // This program is free software; you can redistribute it and/or // modify it under the terms of the GNU General Public License // as published by the Free Software Foundation; either version 2 // of the License, or (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. // // For more information, bug reports, fixes, contact: // Christopher Manning // Dept of Computer Science, Gates 1A // Stanford CA 94305-9010 // USA // package edu.stanford.nlp.pipeline; import hu.u_szeged.nlp.pos.MagyarlancResourceHolder; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.PrintStream; import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.List; import java.util.Properties; import java.util.Set; import edu.stanford.nlp.ie.NERClassifierCombiner; import edu.stanford.nlp.ie.regexp.NumberSequenceClassifier; import edu.stanford.nlp.io.IOUtils; import edu.stanford.nlp.io.RuntimeIOException; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.process.PTBTokenizer; import edu.stanford.nlp.util.Generics; import edu.stanford.nlp.util.PropertiesUtils; import edu.stanford.nlp.util.ReflectionLoading; /** * This is a pipeline that takes in a string and returns various analyzed linguistic forms. 
 * The String is
 * tokenized via a tokenizer (such as PTBTokenizerAnnotator), and then other sequence model style annotation
 * can be used to add things like lemmas, POS tags, and named entities. These are returned as a list of
 * CoreLabels. Other analysis components build and store parse trees, dependency graphs, etc.
 * <p>
 * This class is designed to apply multiple Annotators to an Annotation. The idea is that you first build up
 * the pipeline by adding Annotators, and then you take the objects you wish to annotate and pass them in and
 * get in return a fully annotated object. At the command-line level you can, e.g., tokenize text with
 * StanfordCoreNLP with a command like: <br/>
 *
 * <pre>
 * java edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit -file document.txt
 * </pre>
 *
 * <br/>
 * Please see the package level javadoc for sample usage and a more complete description.
 * <p>
 * The main entry point for the API is StanfordCoreNLP.process() .
 * <p>
 * <i>Implementation note:</i> There are other annotation pipelines, but they don't extend this one. Look for
 * classes that implement Annotator and which have "Pipeline" in their name.
 *
 * @author Jenny Finkel
 * @author Anna Rafferty
 * @author Christopher Manning
 * @author Mihai Surdeanu
 * @author Steven Bethard
 */
public class SzTECoreNLP extends SzTEAnnotationPipeline { // EXTENSION

  // EXTENSION
  // Language selected for the pipeline; set from the "lang" property in construct()
  // (defaults to "en" there). NOTE(review): mutable public static state — all pipelines
  // in the JVM share this value.
  public static String lang = null;

  /*
   * List of all known annotator property names. Add new annotators and/or annotators from other groups
   * here!
   */
  private static final String NORMALIZATION = "normalize";
  private static final String STOPWORD_CHECK = "stopword";
  private static final String MWE = "mwe";
  // --> EXTENSION

  // other constants
  public static final String CUSTOM_ANNOTATOR_PREFIX = "customAnnotatorClass.";
  private static final String PROPS_SUFFIX = ".properties";
  public static final String NEWLINE_SPLITTER_PROPERTY = "ssplit.eolonly";
  public static final String NEWLINE_IS_SENTENCE_BREAK_PROPERTY = "ssplit.newlineIsSentenceBreak";
  public static final String DEFAULT_NEWLINE_IS_SENTENCE_BREAK = "two";

  /** Stores the overall number of words processed */
  private int numWords;

  /** Maintains the shared pool of annotators (shared across all instances in the JVM) */
  public static AnnotatorPool pool = null;

  /** The properties this pipeline was constructed from; see getProperties() */
  private Properties properties;

  /**
   * Constructs a pipeline using as properties the properties file found in the classpath
   */
  public SzTECoreNLP() {
    this((Properties) null);
  }

  /**
   * Construct a basic pipeline. The Properties will be used to determine which annotators to create, and a
   * default AnnotatorPool will be used to create the annotators.
   *
   * @param props properties describing the annotators to load; if null, a properties file is
   *          searched for on the classpath
   */
  public SzTECoreNLP(Properties props) {
    // "enforceRequirements" defaults to true; it can be disabled via the property of the same name
    this(props, (props == null || PropertiesUtils.getBool(props, "enforceRequirements", true)));
  }

  public SzTECoreNLP(Properties props, boolean enforceRequirements) {
    construct(props, enforceRequirements);
  }

  /**
   * Constructs a pipeline with the properties read from this file, which must be found in the classpath
   *
   * @param propsFileNamePrefix classpath resource name (".properties" suffix optional)
   */
  public SzTECoreNLP(String propsFileNamePrefix) {
    this(propsFileNamePrefix, true);
  }

  public SzTECoreNLP(String propsFileNamePrefix, boolean enforceRequirements) {
    Properties props = loadProperties(propsFileNamePrefix);
    if (props == null) {
      throw new RuntimeIOException("ERROR: cannot find properties file \"" + propsFileNamePrefix + "\" in the classpath!");
    }
    construct(props, enforceRequirements);
  }

  //
  // property-specific methods
  //

  /**
   * Fetches a property that must be present; prints usage and throws if it is missing.
   *
   * @param props the properties to read from
   * @param name the required property key
   * @return the property value (never null)
   * @throws RuntimeException if the property is not set
   */
  private static String getRequiredProperty(Properties props, String name) {
    String val = props.getProperty(name);
    if (val == null) {
      System.err.println("Missing property \"" + name + "\"!");
      printRequiredProperties(System.err);
      throw new RuntimeException("Missing property: \"" + name + '\"');
    }
    return val;
  }

  /**
   * Finds the properties file in the classpath and loads the properties from there.
   *
   * @return The found properties object (must be not-null)
   * @throws RuntimeException
   *           If no properties file can be found on the classpath
   */
  private static Properties loadPropertiesFromClasspath() {
    // Try both the short and the fully qualified resource name.
    List<String> validNames = Arrays.asList("StanfordCoreNLP", "edu.stanford.nlp.pipeline.StanfordCoreNLP");
    for (String name : validNames) {
      Properties props = loadProperties(name);
      if (props != null) return props;
    }
    throw new RuntimeException("ERROR: Could not find properties file in the classpath!");
  }

  private static Properties loadProperties(String name) {
    return loadProperties(name, Thread.currentThread().getContextClassLoader());
  }

  /**
   * Loads a properties resource from the given class loader.
   * The name may be a dotted prefix; dots are converted to '/' and ".properties" is appended.
   *
   * @return the loaded properties, or null if the resource is missing or unreadable
   */
  private static Properties loadProperties(String name, ClassLoader loader) {
    if (name.endsWith(PROPS_SUFFIX)) name = name.substring(0, name.length() - PROPS_SUFFIX.length());
    name = name.replace('.', '/');
    name += PROPS_SUFFIX;
    Properties result = null; // Returns null on lookup failures
    System.err.println("Searching for resource: " + name);
    InputStream in = loader.getResourceAsStream(name);
    try {
      if (in != null) {
        InputStreamReader reader = new InputStreamReader(in, "utf-8");
        result = new Properties();
        result.load(reader); // Can throw IOException
      }
    } catch (IOException e) {
      // NOTE(review): IOException is deliberately swallowed here so a broken resource behaves
      // like a missing one; callers treat the null return as "not found".
      result = null;
    } finally {
      IOUtils.closeIgnoringExceptions(in);
    }
    return result;
  }

  /** Fetches the Properties object used to construct this Annotator */
  public Properties getProperties() {
    return properties;
  }

  /** Returns the character encoding property, defaulting to UTF-8. */
  public String getEncoding() {
    return properties.getProperty("encoding", "UTF-8");
  }

  //
  // AnnotatorPool construction support
  //

  /**
   * Builds the pipeline: resolves properties, initializes language-specific resources (Hungarian
   * via magyarlanc), creates the annotator pool, and adds the annotators named in the "annotators"
   * property, in order.
   *
   * @param props pipeline properties; if null they are loaded from the classpath
   * @param enforceRequirements when true, throws if an annotator's prerequisites are not already added
   */
  private void construct(Properties props, boolean enforceRequirements) {
    this.numWords = 0;
    if (props == null) {
      // if undefined, find the properties file in the classpath
      props = loadPropertiesFromClasspath();
    } else if (props.getProperty("annotators") == null) {
      // this happens when some command line options are specified (e.g just "-filelist") but no properties
      // file is.
      // we use the options that are given and let them override the default properties from the class path
      // properties.
      Properties fromClassPath = loadPropertiesFromClasspath();
      fromClassPath.putAll(props);
      props = fromClassPath;
    }
    // EXTENSION
    // Pick the pipeline language ("en" by default); for Hungarian, load the magyarlanc POS
    // resources before any annotator is created.
    lang = props.getProperty("lang", "en");
    if (!props.containsKey("pos.model")) {
      if (lang.equals("hu")) {
        // Resources are expected under <working dir>/resources/magyarlanc/.
        MagyarlancResourceHolder.initCorpus(System.getProperty("user.dir") + "/resources/magyarlanc/szeged_2_3.lex");
        MagyarlancResourceHolder.initFrequencies(System.getProperty("user.dir") + "/resources/magyarlanc/szeged_2_3.freq");
        MagyarlancResourceHolder.initRFSA(System.getProperty("user.dir") + "/resources/magyarlanc/rfsa.txt");
        MagyarlancResourceHolder.initCorrDic(System.getProperty("user.dir") + "/resources/magyarlanc/corrdic.txt");
        MagyarlancResourceHolder.initMorPhonDir();
        MagyarlancResourceHolder.initMSDReducer();
        MagyarlancResourceHolder.initKRToMSD();
        props.put("pos.model", System.getProperty("user.dir") + "/resources/magyarlanc/szeged_2_3.model");
      } else {
        props.put("pos.model", DefaultPaths.DEFAULT_POS_MODEL);
      }
    }
    // -->EXTENSION
    this.properties = props;
    // NOTE(review): this local shadows the static field "pool"; the static is assigned inside
    // getDefaultAnnotatorPool(), so both end up referring to the same object.
    AnnotatorPool pool = getDefaultAnnotatorPool(props);
    // now construct the annotators from the given properties in the given order
    List<String> annoNames = Arrays.asList(getRequiredProperty(props, "annotators").split("[, \t]+"));
    Set<String> alreadyAddedAnnoNames = new HashSet<String>();
    Set<Requirement> requirementsSatisfied = new HashSet<Requirement>();
    for (String name : annoNames) {
      name = name.trim();
      if (name.isEmpty()) {
        continue;
      }
      System.err.println("Adding annotator " + name);
      Annotator an = pool.get(name);
      this.addAnnotator(an);
      if (enforceRequirements) {
        // Every requirement must have been satisfied by an annotator added earlier in the list.
        Set<Requirement> allRequirements = an.requires();
        for (Requirement requirement : allRequirements) {
          if (!requirementsSatisfied.contains(requirement)) {
            String fmt = "annotator \"%s\" requires annotator \"%s\"";
            throw new IllegalArgumentException(String.format(fmt, name, requirement));
          }
        }
        requirementsSatisfied.addAll(an.requirementsSatisfied());
      }
      alreadyAddedAnnoNames.add(name);
    }
    // Sanity check: without a sentence splitter, newline-only splitting makes no sense.
    if (!alreadyAddedAnnoNames.contains(STANFORD_SSPLIT)) {
      System.setProperty(NEWLINE_SPLITTER_PROPERTY, "false");
    }
  }

  /**
   * Call this if you are no longer using StanfordCoreNLP and want to release the memory associated with the
   * annotators.
   */
  public static synchronized void clearAnnotatorPool() {
    pool = null;
  }

  /**
   * Creates (on first call) or reuses the shared AnnotatorPool and registers the factory for every
   * known annotator in it. Factories are lazy: the annotator is only instantiated when fetched.
   */
  private static synchronized AnnotatorPool getDefaultAnnotatorPool(final Properties inputProps) {
    // if the pool already exists reuse!
    if (pool == null) {
      // first time we get here
      pool = new AnnotatorPool();
    }
    //
    // tokenizer: breaks text into a sequence of tokens
    // this is required for all following annotators!
    //
    pool.register(STANFORD_TOKENIZE, new AnnotatorFactory(inputProps) {
      private static final long serialVersionUID = 1L;

      @Override
      public Annotator create() {
        if (Boolean.valueOf(properties.getProperty("tokenize.whitespace", "false"))) {
          return new WhitespaceTokenizerAnnotator(properties);
        } else {
          String options = properties.getProperty("tokenize.options", PTBTokenizerAnnotator.DEFAULT_OPTIONS);
          boolean keepNewline = Boolean.valueOf(properties.getProperty(NEWLINE_SPLITTER_PROPERTY, "false"));
          // If they set the newline-is-sentence-break property, keep newlines as tokens too.
          if (properties.getProperty(NEWLINE_IS_SENTENCE_BREAK_PROPERTY) != null) {
            keepNewline = true;
          }
          // If the user specifies "tokenizeNLs=false" in tokenize.options, then this default will
          // be overridden.
          if (keepNewline) {
            options = "tokenizeNLs," + options;
          }
          // EXTENSION
          // Hungarian input uses the custom tokenizer; everything else falls back to PTB.
          if (lang.equals("hu")) {
            return new HunTokenizerAnnotator(false, options);
          } else {
            return new PTBTokenizerAnnotator(false, options);
          }
          // return new PTBTokenizerAnnotator(false, options);
          // -->EXTENSION
        }
      }

      @Override
      public String signature() {
        // keep track of all relevant properties for this annotator here!
        StringBuilder os = new StringBuilder();
        os.append("tokenize.whitespace:" + properties.getProperty("tokenize.whitespace", "false"));
        if (Boolean.valueOf(properties.getProperty("tokenize.whitespace", "false"))) {
          os.append(WhitespaceTokenizerAnnotator.EOL_PROPERTY + ":" + properties.getProperty(WhitespaceTokenizerAnnotator.EOL_PROPERTY, "false"));
          os.append(StanfordCoreNLP.NEWLINE_SPLITTER_PROPERTY + ":" + properties.getProperty(StanfordCoreNLP.NEWLINE_SPLITTER_PROPERTY, "false"));
          return os.toString();
        } else {
          os.append(NEWLINE_SPLITTER_PROPERTY + ":" + Boolean.valueOf(properties.getProperty(NEWLINE_SPLITTER_PROPERTY, "false")));
          os.append(NEWLINE_IS_SENTENCE_BREAK_PROPERTY + ":" + properties.getProperty(NEWLINE_IS_SENTENCE_BREAK_PROPERTY, DEFAULT_NEWLINE_IS_SENTENCE_BREAK));
        }
        return os.toString();
      }
    });

    pool.register(STANFORD_CLEAN_XML, new AnnotatorFactory(inputProps) {
      private static final long serialVersionUID = 1L;

      @Override
      public Annotator create() {
        // Only the xml tags, sentence-ending tags and date tags are wired through to
        // MyCleanXmlAnnotator; the remaining stock CleanXmlAnnotator options below are
        // intentionally disabled (kept for reference).
        String xmlTags = properties.getProperty("clean.xmltags", CleanXmlAnnotator.DEFAULT_XML_TAGS);
        String sentenceEndingTags = properties.getProperty("clean.sentenceendingtags", CleanXmlAnnotator.DEFAULT_SENTENCE_ENDERS);
        // String singleSentenceTags = properties.getProperty("clean.singlesentencetags",
        // CleanXmlAnnotator.DEFAULT_SINGLE_SENTENCE_TAGS);
        // String allowFlawedString = properties.getProperty("clean.allowflawedxml");
        // boolean allowFlawed = CleanXmlAnnotator.DEFAULT_ALLOW_FLAWS;
        // if (allowFlawedString != null)
        // allowFlawed = Boolean.valueOf(allowFlawedString);
        String dateTags = properties.getProperty("clean.datetags", CleanXmlAnnotator.DEFAULT_DATE_TAGS);
        // String docIdTags = properties.getProperty("clean.docIdtags", CleanXmlAnnotator.DEFAULT_DOCID_TAGS);
        // String docTypeTags = properties.getProperty("clean.docTypetags",
        // CleanXmlAnnotator.DEFAULT_DOCTYPE_TAGS);
        // String utteranceTurnTags = properties.getProperty("clean.turntags",
        // CleanXmlAnnotator.DEFAULT_UTTERANCE_TURN_TAGS);
        // String speakerTags = properties.getProperty("clean.speakertags",
        // CleanXmlAnnotator.DEFAULT_SPEAKER_TAGS);
        // String docAnnotations = properties.getProperty("clean.docAnnotations",
        // CleanXmlAnnotator.DEFAULT_DOC_ANNOTATIONS_PATTERNS);
        // String tokenAnnotations = properties.getProperty("clean.tokenAnnotations",
        // CleanXmlAnnotator.DEFAULT_TOKEN_ANNOTATIONS_PATTERNS);
        // String sectionTags = properties.getProperty("clean.sectiontags",
        // CleanXmlAnnotator.DEFAULT_SECTION_TAGS);
        // String sectionAnnotations = properties.getProperty("clean.sectionAnnotations",
        // CleanXmlAnnotator.DEFAULT_SECTION_ANNOTATIONS_PATTERNS);
        // String ssplitDiscardTokens = properties.getProperty("clean.ssplitDiscardTokens");
        MyCleanXmlAnnotator annotator = new MyCleanXmlAnnotator(xmlTags, sentenceEndingTags, dateTags);
        // annotator.setSingleSentenceTagMatcher(singleSentenceTags);
        // annotator.setDocIdTagMatcher(docIdTags);
        // annotator.setDocTypeTagMatcher(docTypeTags);
        // annotator.setDiscourseTags(utteranceTurnTags, speakerTags);
        // annotator.setDocAnnotationPatterns(docAnnotations);
        // annotator.setTokenAnnotationPatterns(tokenAnnotations);
        // annotator.setSectionTagMatcher(sectionTags);
        // annotator.setSectionAnnotationPatterns(sectionAnnotations);
        // annotator.setSsplitDiscardTokensMatcher(ssplitDiscardTokens);
        return annotator;
      }

      @Override
      public String signature() {
        // keep track of all relevant properties for this annotator here!
        // NOTE(review): the "clean.sentenceendingtags" label appears twice below — the second
        // occurrence reads "clean.singlesentencetags" but is labelled with the wrong key
        // (looks like a copy-paste slip; harmless for caching as long as it is stable).
        return "clean.xmltags:" + properties.getProperty("clean.xmltags", CleanXmlAnnotator.DEFAULT_XML_TAGS)
            + "clean.sentenceendingtags:" + properties.getProperty("clean.sentenceendingtags", CleanXmlAnnotator.DEFAULT_SENTENCE_ENDERS)
            + "clean.sentenceendingtags:" + properties.getProperty("clean.singlesentencetags", CleanXmlAnnotator.DEFAULT_SINGLE_SENTENCE_TAGS)
            + "clean.allowflawedxml:" + properties.getProperty("clean.allowflawedxml", "")
            + "clean.datetags:" + properties.getProperty("clean.datetags", CleanXmlAnnotator.DEFAULT_DATE_TAGS)
            + "clean.docidtags:" + properties.getProperty("clean.docid", CleanXmlAnnotator.DEFAULT_DOCID_TAGS)
            + "clean.doctypetags:" + properties.getProperty("clean.doctype", CleanXmlAnnotator.DEFAULT_DOCTYPE_TAGS)
            + "clean.turntags:" + properties.getProperty("clean.turntags", CleanXmlAnnotator.DEFAULT_UTTERANCE_TURN_TAGS)
            + "clean.speakertags:" + properties.getProperty("clean.speakertags", CleanXmlAnnotator.DEFAULT_SPEAKER_TAGS)
            + "clean.docAnnotations:" + properties.getProperty("clean.docAnnotations", CleanXmlAnnotator.DEFAULT_DOC_ANNOTATIONS_PATTERNS)
            + "clean.tokenAnnotations:" + properties.getProperty("clean.tokenAnnotations", CleanXmlAnnotator.DEFAULT_TOKEN_ANNOTATIONS_PATTERNS)
            + "clean.sectiontags:" + properties.getProperty("clean.sectiontags", CleanXmlAnnotator.DEFAULT_SECTION_TAGS)
            + "clean.sectionAnnotations:" + properties.getProperty("clean.sectionAnnotations", CleanXmlAnnotator.DEFAULT_SECTION_ANNOTATIONS_PATTERNS);
      }
    });

    //
    // Sentence splitter: splits the above sequence of tokens into
    // sentences. This is required when processing entire documents or
    // text consisting of multiple sentences.
    //
    pool.register(STANFORD_SSPLIT, new AnnotatorFactory(inputProps) {
      private static final long serialVersionUID = 1L;

      @Override
      public Annotator create() {
        boolean nlSplitting = Boolean.valueOf(properties.getProperty(NEWLINE_SPLITTER_PROPERTY, "false"));
        if (nlSplitting) {
          boolean whitespaceTokenization = Boolean.valueOf(properties.getProperty("tokenize.whitespace", "false"));
          if (whitespaceTokenization) {
            if (System.getProperty("line.separator").equals("\n")) {
              return WordsToSentencesAnnotator.newlineSplitter(false, "\n");
            } else {
              // throw "\n" in just in case files use that instead of
              // the system separator
              return WordsToSentencesAnnotator.newlineSplitter(false, System.getProperty("line.separator"), "\n");
            }
          } else {
            return WordsToSentencesAnnotator.newlineSplitter(false, PTBTokenizer.getNewlineToken());
          }
        } else {
          // Treat as one sentence: You get a no-op sentence splitter that always returns all tokens as one
          // sentence.
          String isOneSentence = properties.getProperty("ssplit.isOneSentence");
          if (Boolean.parseBoolean(isOneSentence)) {
            // this method treats null as false
            return WordsToSentencesAnnotator.nonSplitter(false);
          }
          // multi token sentence boundaries
          String boundaryMultiTokenRegex = properties.getProperty("ssplit.boundaryMultiTokenRegex");
          // Discard these tokens without marking them as sentence boundaries
          String tokenPatternsToDiscardProp = properties.getProperty("ssplit.tokenPatternsToDiscard");
          Set<String> tokenRegexesToDiscard = null;
          if (tokenPatternsToDiscardProp != null) {
            String[] toks = tokenPatternsToDiscardProp.split(",");
            tokenRegexesToDiscard = Generics.newHashSet(Arrays.asList(toks));
          }
          // regular boundaries
          String boundaryTokenRegex = properties.getProperty("ssplit.boundaryTokenRegex");
          Set<String> boundariesToDiscard = null;
          // newline boundaries which are discarded.
          String bounds = properties.getProperty("ssplit.boundariesToDiscard");
          if (bounds != null) {
            String[] toks = bounds.split(",");
            boundariesToDiscard = Generics.newHashSet(Arrays.asList(toks));
          }
          Set<String> htmlElementsToDiscard = null;
          // HTML boundaries which are discarded
          bounds = properties.getProperty("ssplit.htmlBoundariesToDiscard");
          if (bounds != null) {
            String[] elements = bounds.split(",");
            htmlElementsToDiscard = Generics.newHashSet(Arrays.asList(elements));
          }
          String nlsb = properties.getProperty(NEWLINE_IS_SENTENCE_BREAK_PROPERTY, DEFAULT_NEWLINE_IS_SENTENCE_BREAK);
          return new WordsToSentencesAnnotator(false, boundaryTokenRegex, boundariesToDiscard, htmlElementsToDiscard, nlsb, boundaryMultiTokenRegex, tokenRegexesToDiscard);
        }
      }

      @Override
      public String signature() {
        // keep track of all relevant properties for this annotator here!
        StringBuilder os = new StringBuilder();
        os.append(NEWLINE_SPLITTER_PROPERTY + ":" + properties.getProperty(NEWLINE_SPLITTER_PROPERTY, "false"));
        if (Boolean.valueOf(properties.getProperty(NEWLINE_SPLITTER_PROPERTY, "false"))) {
          os.append("tokenize.whitespace:" + properties.getProperty("tokenize.whitespace", "false"));
        } else {
          os.append("ssplit.isOneSentence:" + properties.getProperty("ssplit.isOneSentence", "false"));
          if (!Boolean.valueOf(properties.getProperty("ssplit.isOneSentence", "false"))) {
            os.append("ssplit.boundaryTokenRegex:" + properties.getProperty("ssplit.boundaryTokenRegex", ""));
            os.append("ssplit.boundariesToDiscard:" + properties.getProperty("ssplit.boundariesToDiscard", ""));
            os.append("ssplit.htmlBoundariesToDiscard:" + properties.getProperty("ssplit.htmlBoundariesToDiscard", ""));
            os.append(NEWLINE_IS_SENTENCE_BREAK_PROPERTY + ":" + properties.getProperty(NEWLINE_IS_SENTENCE_BREAK_PROPERTY, DEFAULT_NEWLINE_IS_SENTENCE_BREAK));
          }
        }
        return os.toString();
      }
    });

    //
    // POS tagger
    //
    pool.register(STANFORD_POS, new AnnotatorFactory(inputProps) {
      private static final long serialVersionUID = 1L;

      @Override
      public
      Annotator create() {
        try {
          // EXTENSION
          // OwnPOSTaggerAnnotator replaces the stock POSTaggerAnnotator; for Hungarian,
          // pos.model was pointed at the magyarlanc model in construct().
          return new OwnPOSTaggerAnnotator(inputProps.getProperty("pos.model", DefaultPaths.DEFAULT_POS_MODEL), inputProps);
          // return new POSTaggerAnnotator(inputProps.getProperty("pos.model",
          // DefaultPaths.DEFAULT_POS_MODEL), inputProps);
          // -->EXTENSION
        } catch (Exception e) {
          throw new RuntimeException(e);
        }
      }

      @Override
      public String signature() {
        // keep track of all relevant properties for this annotator here!
        return ("pos.maxlen:" + properties.getProperty("pos.maxlen", "") + "pos.model:" + properties.getProperty("pos.model", DefaultPaths.DEFAULT_POS_MODEL) + "pos.nthreads:" + properties.getProperty("pos.nthreads", properties.getProperty("nthreads", "")));
      }
    });

    //
    // Lemmatizer
    //
    pool.register(STANFORD_LEMMA, new AnnotatorFactory(inputProps) {
      private static final long serialVersionUID = 1L;

      @Override
      public Annotator create() {
        // return new MorphaAnnotator(false);
        // OwnMorphaAnnotator only runs the English Morpha finite-state lemmatizer when lang == "en".
        return new OwnMorphaAnnotator(false, lang.equals("en"));
      }

      @Override
      public String signature() {
        // keep track of all relevant properties for this annotator here!
        // nothing for this one
        return "";
      }
    });

    //
    // NER
    //
    pool.register(STANFORD_NER, new AnnotatorFactory(inputProps) {
      private static final long serialVersionUID = 1L;

      @Override
      public Annotator create() {
        List<String> models = new ArrayList<String>();
        String modelNames = properties.getProperty("ner.model");
        if (modelNames == null) {
          // Default to the three stock CRF models, applied in order.
          modelNames = DefaultPaths.DEFAULT_NER_THREECLASS_MODEL + "," + DefaultPaths.DEFAULT_NER_MUC_MODEL + "," + DefaultPaths.DEFAULT_NER_CONLL_MODEL;
        }
        if (modelNames.length() > 0) {
          models.addAll(Arrays.asList(modelNames.split(",")));
        }
        if (models.isEmpty()) {
          // Allow for no real NER model - can just use numeric classifiers or SUTime.
          // Have to unset ner.model, so unlikely that people got here by accident.
          System.err.println("WARNING: no NER models specified");
        }
        NERClassifierCombiner nerCombiner;
        try {
          boolean applyNumericClassifiers = PropertiesUtils.getBool(properties, NERClassifierCombiner.APPLY_NUMERIC_CLASSIFIERS_PROPERTY, NERClassifierCombiner.APPLY_NUMERIC_CLASSIFIERS_DEFAULT);
          boolean useSUTime = PropertiesUtils.getBool(properties, NumberSequenceClassifier.USE_SUTIME_PROPERTY, NumberSequenceClassifier.USE_SUTIME_DEFAULT);
          nerCombiner = new NERClassifierCombiner(applyNumericClassifiers, useSUTime, properties, models.toArray(new String[models.size()]));
        } catch (FileNotFoundException e) {
          throw new RuntimeIOException(e);
        }
        return new NERCombinerAnnotator(nerCombiner, false);
      }

      @Override
      public String signature() {
        // keep track of all relevant properties for this annotator here!
        return "ner.model:" + properties.getProperty("ner.model", "")
            + "ner.model.3class:" + properties.getProperty("ner.model.3class", DefaultPaths.DEFAULT_NER_THREECLASS_MODEL)
            + "ner.model.7class:" + properties.getProperty("ner.model.7class", DefaultPaths.DEFAULT_NER_MUC_MODEL)
            + "ner.model.MISCclass:" + properties.getProperty("ner.model.MISCclass", DefaultPaths.DEFAULT_NER_CONLL_MODEL)
            + NERClassifierCombiner.APPLY_NUMERIC_CLASSIFIERS_PROPERTY + ":" + properties.getProperty(NERClassifierCombiner.APPLY_NUMERIC_CLASSIFIERS_PROPERTY, Boolean.toString(NERClassifierCombiner.APPLY_NUMERIC_CLASSIFIERS_DEFAULT))
            + NumberSequenceClassifier.USE_SUTIME_PROPERTY + ":" + properties.getProperty(NumberSequenceClassifier.USE_SUTIME_PROPERTY, Boolean.toString(NumberSequenceClassifier.USE_SUTIME_DEFAULT));
      }
    });

    //
    // Regex NER
    //
    pool.register(STANFORD_REGEXNER, new AnnotatorFactory(inputProps) {
      private static final long serialVersionUID = 1L;

      @Override
      public Annotator create() {
        return new TokensRegexNERAnnotator("regexner", properties);
      }

      @Override
      public String signature() {
        // keep track of all relevant properties for this annotator here!
        return PropertiesUtils.getSignature("regexner", properties, TokensRegexNERAnnotator.SUPPORTED_PROPERTIES);
      }
    });

    //
    // Gender Annotator
    //
    pool.register(STANFORD_GENDER, new AnnotatorFactory(inputProps) {
      private static final long serialVersionUID = 1L;

      @Override
      public Annotator create() {
        return new GenderAnnotator(false, properties.getProperty("gender.firstnames", DefaultPaths.DEFAULT_GENDER_FIRST_NAMES));
      }

      @Override
      public String signature() {
        // keep track of all relevant properties for this annotator here!
        return "gender.firstnames:" + properties.getProperty("gender.firstnames", DefaultPaths.DEFAULT_GENDER_FIRST_NAMES);
      }
    });

    //
    // True caser
    //
    pool.register(STANFORD_TRUECASE, new AnnotatorFactory(inputProps) {
      private static final long serialVersionUID = 1L;

      @Override
      public Annotator create() {
        String model = properties.getProperty("truecase.model", DefaultPaths.DEFAULT_TRUECASE_MODEL);
        String bias = properties.getProperty("truecase.bias", TrueCaseAnnotator.DEFAULT_MODEL_BIAS);
        String mixed = properties.getProperty("truecase.mixedcasefile", DefaultPaths.DEFAULT_TRUECASE_DISAMBIGUATION_LIST);
        return new TrueCaseAnnotator(model, bias, mixed, false);
      }

      @Override
      public String signature() {
        // keep track of all relevant properties for this annotator here!
        return "truecase.model:" + properties.getProperty("truecase.model", DefaultPaths.DEFAULT_TRUECASE_MODEL)
            + "truecase.bias:" + properties.getProperty("truecase.bias", TrueCaseAnnotator.DEFAULT_MODEL_BIAS)
            + "truecase.mixedcasefile:" + properties.getProperty("truecase.mixedcasefile", DefaultPaths.DEFAULT_TRUECASE_DISAMBIGUATION_LIST);
      }
    });

    //
    // Parser
    //
    pool.register(STANFORD_PARSE, new AnnotatorFactory(inputProps) {
      private static final long serialVersionUID = 1L;

      @Override
      public Annotator create() {
        // parse.type selects the backend: "stanford" (default, in-process) or "charniak"
        // (external executable).
        String parserType = properties.getProperty("parse.type", "stanford");
        String maxLenStr = properties.getProperty("parse.maxlen");
        if (parserType.equalsIgnoreCase("stanford")) {
          ParserAnnotator anno = new ParserAnnotator("parse", properties);
          return anno;
        } else if (parserType.equalsIgnoreCase("charniak")) {
          String model = properties.getProperty("parse.model");
          String parserExecutable = properties.getProperty("parse.executable");
          if (model == null || parserExecutable == null) {
            throw new RuntimeException("Both parse.model and parse.executable properties must be specified if parse.type=charniak");
          }
          int maxLen = 399;
          if (maxLenStr != null) {
            maxLen = Integer.parseInt(maxLenStr);
          }
          CharniakParserAnnotator anno = new CharniakParserAnnotator(model, parserExecutable, false, maxLen);
          return anno;
        } else {
          throw new RuntimeException("Unknown parser type: " + parserType + " (currently supported: stanford and charniak)");
        }
      }

      @Override
      public String signature() {
        // keep track of all relevant properties for this annotator here!
        String type = properties.getProperty("parse.type", "stanford");
        if (type.equalsIgnoreCase("stanford")) {
          return ParserAnnotator.signature("parser", properties);
        } else if (type.equalsIgnoreCase("charniak")) {
          return "parse.model:" + properties.getProperty("parse.model", "")
              + "parse.executable:" + properties.getProperty("parse.executable", "")
              + "parse.maxlen:" + properties.getProperty("parse.maxlen", "");
        } else {
          throw new RuntimeException("Unknown parser type: " + type + " (currently supported: stanford and charniak)");
        }
      }
    });

    //
    // Coreference resolution
    //
    pool.register(STANFORD_DETERMINISTIC_COREF, new AnnotatorFactory(inputProps) {
      private static final long serialVersionUID = 1L;

      @Override
      public Annotator create() {
        return new DeterministicCorefAnnotator(properties);
      }

      @Override
      public String signature() {
        // keep track of all relevant properties for this annotator here!
        return DeterministicCorefAnnotator.signature(properties);
      }
    });

    // add annotators loaded via reflection from classnames specified
    // in the properties
    for (Object propertyKey : inputProps.stringPropertyNames()) {
      // NOTE(review): stringPropertyNames() returns Set<String>, so this instanceof guard can
      // never fire; kept as-is for safety.
      if (!(propertyKey instanceof String)) continue; // should this be an Exception?
      String property = (String) propertyKey;
      if (property.startsWith(CUSTOM_ANNOTATOR_PREFIX)) {
        final String customName = property.substring(CUSTOM_ANNOTATOR_PREFIX.length());
        final String customClassName = inputProps.getProperty(property);
        System.err.println("Registering annotator " + customName + " with class " + customClassName);
        pool.register(customName, new AnnotatorFactory(inputProps) {
          private static final long serialVersionUID = 1L;
          private final String name = customName;
          private final String className = customClassName;

          @Override
          public Annotator create() {
            return ReflectionLoading.loadByReflection(className, name, properties);
          }

          @Override
          public String signature() {
            // keep track of all relevant properties for this annotator here!
// since we don't know what props they need, let's copy all // TODO: can we do better here? maybe signature() should be a method in the Annotator? StringBuilder os = new StringBuilder(); for (Object key : properties.keySet()) { String skey = (String) key; os.append(skey + ":" + properties.getProperty(skey)); } return os.toString(); } }); } } pool.register(STANFORD_RELATION, new AnnotatorFactory(inputProps) { private static final long serialVersionUID = 1L; @Override public Annotator create() { return new RelationExtractorAnnotator(properties); } @Override public String signature() { // keep track of all relevant properties for this annotator here! return "sup.relation.verbose:" + properties.getProperty("sup.relation.verbose", "false") + properties.getProperty("sup.relation.model", DefaultPaths.DEFAULT_SUP_RELATION_EX_RELATION_MODEL); } }); pool.register(STANFORD_SENTIMENT, new AnnotatorFactory(inputProps) { private static final long serialVersionUID = 1L; @Override public Annotator create() { return new SentimentAnnotator(STANFORD_SENTIMENT, properties); } @Override public String signature() { return "model=" + inputProps.get("model"); } }); // // Psudophrase generation // pool.register(NORMALIZATION, new AnnotatorFactory(inputProps) { private static final long serialVersionUID = 1L; public Annotator create() { return new NormalizerAnnotator(); } @Override public String signature() { // keep track of all relevant properties for this annotator here! StringBuilder os = new StringBuilder(); // no used props for this one return os.toString(); } }); // // Stopword checking // pool.register(STOPWORD_CHECK, new AnnotatorFactory(inputProps) { private static final long serialVersionUID = 1L; public Annotator create() { return new StopWordAnnotator(); } @Override public String signature() { // keep track of all relevant properties for this annotator here! 
StringBuilder os = new StringBuilder();
// no used props for this one
return os.toString();
}
});

//
// MWE annotation (EXTENSION)
//
pool.register(MWE, new AnnotatorFactory(inputProps) {
    private static final long serialVersionUID = 1L;

    public Annotator create() {
        // "mwe.file" points at the multi-word-expression dictionary file.
        return new MweDictAnnotator(false, inputProps.getProperty("mwe.file"));
    }

    @Override
    public String signature() {
        // keep track of all relevant properties for this annotator here!
        StringBuilder os = new StringBuilder();
        // no used props for this one
        return os.toString();
    }
});
// -->EXTENSION

return pool;
}

/**
 * Fetches an already-registered annotator from the shared static pool.
 *
 * @param name the name the annotator was registered under
 * @return the pooled annotator, or null if the pool has not been created yet
 *         or stores no annotator of that name (errors are reported on stderr
 *         rather than thrown)
 */
public static synchronized Annotator getExistingAnnotator(String name) {
    if (pool == null) {
        System.err.println("ERROR: attempted to fetch annotator \"" + name + "\" before the annotator pool was created!");
        return null;
    }
    try {
        Annotator a = pool.get(name);
        return a;
    } catch (IllegalArgumentException e) {
        // pool.get throws IllegalArgumentException for unknown names; report it
        // and return null instead of propagating.
        System.err.println("ERROR: attempted to fetch annotator \"" + name + "\" but the annotator pool does not store any such type!");
        return null;
    }
}

/**
 * Runs the full annotator pipeline on the given annotation, then accumulates
 * the token count into numWords for the timing report.
 */
@Override
public void annotate(Annotation annotation) {
    super.annotate(annotation);
    List<CoreLabel> words = annotation.get(CoreAnnotations.TokensAnnotation.class);
    if (words != null) {
        numWords += words.size();
    }
}

/**
 * Prints the list of properties required to run the pipeline.
 *
 * @param os
 *          PrintStream to print usage to
 */
private static void printRequiredProperties(PrintStream os) {
    // TODO some annotators (ssplit, regexner, gender, some parser options, dcoref?) are not documented
    os.println("The following properties can be defined:");
    os.println("(if -props or -annotators is not passed in, default properties will be loaded via the classpath)");
    os.println("\t\"props\" - path to file with configuration properties");
    os.println("\t\"annotators\" - comma separated list of annotators");
    os.println("\tThe following annotators are supported: cleanxml, tokenize, ssplit, pos, lemma, ner, truecase, parse, coref, dcoref, relation");
    os.println();
    os.println("\tIf annotator \"tokenize\" is defined:");
    os.println("\t\"tokenize.options\" - PTBTokenizer options (see edu.stanford.nlp.process.PTBTokenizer for details)");
    os.println("\t\"tokenize.whitespace\" - If true, just use whitespace tokenization");
    os.println();
    os.println("\tIf annotator \"cleanxml\" is defined:");
    os.println("\t\"clean.xmltags\" - regex of tags to extract text from");
    os.println("\t\"clean.sentenceendingtags\" - regex of tags which mark sentence endings");
    os.println("\t\"clean.allowflawedxml\" - if set to true, don't complain about XML errors");
    os.println();
    os.println("\tIf annotator \"pos\" is defined:");
    os.println("\t\"pos.maxlen\" - maximum length of sentence to POS tag");
    os.println("\t\"pos.model\" - path towards the POS tagger model");
    os.println();
    os.println("\tIf annotator \"ner\" is defined:");
    os.println("\t\"ner.model.3class\" - path towards the three-class NER model");
    os.println("\t\"ner.model.7class\" - path towards the seven-class NER model");
    os.println("\t\"ner.model.MISCclass\" - path towards the NER model with a MISC class");
    os.println();
    os.println("\tIf annotator \"truecase\" is defined:");
    os.println("\t\"truecase.model\" - path towards the true-casing model; default: " + DefaultPaths.DEFAULT_TRUECASE_MODEL);
    os.println("\t\"truecase.bias\" - class bias of the true case model; default: " + TrueCaseAnnotator.DEFAULT_MODEL_BIAS);
    os.println("\t\"truecase.mixedcasefile\" - path towards the mixed case file; default: " + DefaultPaths.DEFAULT_TRUECASE_DISAMBIGUATION_LIST);
    os.println();
    os.println("\tIf annotator \"relation\" is defined:");
    os.println("\t\"sup.relation.verbose\" - whether verbose or not");
    os.println("\t\"sup.relation.model\" - path towards the relation extraction model");
    os.println();
    os.println("\tIf annotator \"parse\" is defined:");
    os.println("\t\"parse.model\" - path towards the PCFG parser model");
    /*
     * XXX: unstable, do not use for now
     * os.println();
     * os.println("\tIf annotator \"srl\" is defined:");
     * os.println("\t\"srl.verb.args\" - path to the file listing verbs and their core arguments (\"verbs.core_args\")");
     * os.println("\t\"srl.model.id\" - path prefix for the role identification model (adds \".model.gz\" and \".fe\" to this prefix)");
     * os.println("\t\"srl.model.cls\" - path prefix for the role classification model (adds \".model.gz\" and \".fe\" to this prefix)");
     * os.println("\t\"srl.model.jic\" - path to the directory containing the joint model's \"model.gz\", \"fe\" and \"je\" files");
     * os.println("\t (if not specified, the joint model will not be used)");
     */
    os.println();
    os.println("Command line properties:");
    os.println("\t\"file\" - run the pipeline on the content of this file, or on the content of the files in this directory");
    os.println("\t XML output is generated for every input file \"file\" as file.xml");
    os.println("\t\"extension\" - if -file used with a directory, process only the files with this extension");
    os.println("\t\"filelist\" - run the pipeline on the list of files given in this file");
    os.println("\t output is generated for every input file as file.outputExtension");
    os.println("\t\"outputDirectory\" - where to put output (defaults to the current directory)");
    os.println("\t\"outputExtension\" - extension to use for the output file (defaults to \".xml\" for XML, \".ser.gz\" for serialized). Don't forget the dot!");
    os.println("\t\"outputFormat\" - \"xml\" to output XML (default), \"serialized\" to output serialized Java objects, \"text\" to output text");
    os.println("\t\"serializer\" - Class of annotation serializer to use when outputFormat is \"serialized\". By default, uses Java serialization.");
    os.println("\t\"replaceExtension\" - flag to chop off the last extension before adding outputExtension to file");
    os.println("\t\"noClobber\" - don't automatically override (clobber) output files that already exist");
    os.println("\t\"threads\" - multithread on this number of threads");
    os.println();
    os.println("If none of the above are present, run the pipeline in an interactive shell (default properties will be loaded from the classpath).");
    os.println("The shell accepts input from stdin and displays the output at stdout.");
    os.println();
    os.println("Run with -help [topic] for more help on a specific topic.");
    os.println("Current topics include: parser");
    os.println();
}

/**
 * {@inheritDoc}
 */
@Override
public String timingInformation() {
    StringBuilder sb = new StringBuilder(super.timingInformation());
    if (TIME && numWords >= 0) {
        long total = this.getTotalTime();
        sb.append(" for ").append(this.numWords).append(" tokens at ");
        // tokens per second = numWords / (total elapsed ms / 1000)
        sb.append(String.format("%.1f", numWords / (((double) total) / 1000)));
        sb.append(" tokens/sec.");
    }
    return sb.toString();
}
}