package edu.stanford.nlp.pipeline;

import java.io.Reader;
import java.io.StringReader;
import java.util.*;

import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.*;
import edu.stanford.nlp.international.spanish.process.SpanishTokenizer;
import edu.stanford.nlp.international.french.process.FrenchTokenizer;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.logging.Redwood;

/**
 * This class will PTB tokenize the input. It assumes that the original
 * String is under the CoreAnnotations.TextAnnotation field
 * and it will add the output from the
 * InvertiblePTBTokenizer ({@code List<CoreLabel>}) under
 * CoreAnnotations.TokensAnnotation.
 *
 * @author Jenny Finkel
 * @author Christopher Manning
 * @author Ishita Prasad
 */
public class TokenizerAnnotator implements Annotator {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(TokenizerAnnotator.class);

  /**
   * Enum to identify the different TokenizerTypes. To add a new
   * TokenizerType, add it to the list with a default options string
   * and add a case in the switch in initFactory to instantiate it.
   */
  public enum TokenizerType {
    Unspecified(null, null, "invertible,ptb3Escaping=true"),
    Arabic     ("ar", null, ""),
    Chinese    ("zh", null, ""),
    Spanish    ("es", "SpanishTokenizer", "invertible,ptb3Escaping=true,splitAll=true"),
    English    ("en", "PTBTokenizer", "invertible,ptb3Escaping=true"),
    German     ("de", null, "invertible,ptb3Escaping=true"),
    French     ("fr", "FrenchTokenizer", ""),
    Whitespace (null, "WhitespaceTokenizer", "");

    private final String abbreviation;    // two-letter language code (e.g. "en"), or null
    private final String className;       // simple name of the tokenizer class, or null
    private final String defaultOptions;  // options used when tokenize.options is not set

    TokenizerType(String abbreviation, String className, String defaultOptions) {
      this.abbreviation = abbreviation;
      this.className = className;
      this.defaultOptions = defaultOptions;
    }

    public String getDefaultOptions() {
      return defaultOptions;
    }

    /** Upper-cased language abbreviation or enum name -&gt; type. */
    private static final Map<String, TokenizerType> nameToTokenizerMap = initializeNameMap();

    private static Map<String, TokenizerType> initializeNameMap() {
      Map<String, TokenizerType> map = Generics.newHashMap();
      for (TokenizerType type : TokenizerType.values()) {
        if (type.abbreviation != null) {
          map.put(type.abbreviation.toUpperCase(), type);
        }
        map.put(type.toString().toUpperCase(), type);
      }
      return Collections.unmodifiableMap(map);
    }

    /** Upper-cased tokenizer class name -&gt; type. */
    private static final Map<String, TokenizerType> classToTokenizerMap = initializeClassMap();

    private static Map<String, TokenizerType> initializeClassMap() {
      Map<String, TokenizerType> map = Generics.newHashMap();
      for (TokenizerType type : TokenizerType.values()) {
        if (type.className != null) {
          map.put(type.className.toUpperCase(), type);
        }
      }
      return Collections.unmodifiableMap(map);
    }

    /**
     * Get TokenizerType based on what's in the properties.
     * Precedence: tokenize.whitespace, then tokenize.class, then tokenize.language.
     *
     * @param props Properties to find tokenizer options in
     * @return An element of the TokenizerType enum indicating the tokenizer to use
     * @throws IllegalArgumentException if tokenize.class or tokenize.language names
     *         a tokenizer this class does not know about
     */
    public static TokenizerType getTokenizerType(Properties props) {
      String tokClass = props.getProperty("tokenize.class", null);
      // parseBoolean avoids the pointless boxing of Boolean.valueOf
      boolean whitespace = Boolean.parseBoolean(props.getProperty("tokenize.whitespace", "false"));
      String language = props.getProperty("tokenize.language", null);

      if (whitespace) {
        return Whitespace;
      }

      if (tokClass != null) {
        TokenizerType type = classToTokenizerMap.get(tokClass.toUpperCase());
        if (type == null) {
          throw new IllegalArgumentException("TokenizerAnnotator: unknown tokenize.class property " + tokClass);
        }
        return type;
      }

      if (language != null) {
        TokenizerType type = nameToTokenizerMap.get(language.toUpperCase());
        if (type == null) {
          throw new IllegalArgumentException("TokenizerAnnotator: unknown tokenize.language property " + language);
        }
        return type;
      }

      return Unspecified;
    }
  } // end enum TokenizerType

  public static final String EOL_PROPERTY = "tokenize.keepeol";

  private final boolean VERBOSE;
  private final TokenizerFactory<CoreLabel> factory;

  /** new segmenter properties **/
  private final boolean useSegmenter;
  private final Annotator segmenterAnnotator;

  // CONSTRUCTORS

  /** Gives a non-verbose, English tokenizer. */
  public TokenizerAnnotator() {
    this(false);
  }

  /**
   * Works out any extra tokenizer options implied by the sentence-splitting
   * properties: if newlines can end sentences, the tokenizer must keep
   * newline tokens ("tokenizeNLs") so the splitter can see them.
   *
   * @param properties the pipeline properties
   * @return "tokenizeNLs," if newline tokens must be kept, otherwise null
   */
  private static String computeExtraOptions(Properties properties) {
    String extraOptions = null;
    boolean keepNewline =
        Boolean.parseBoolean(properties.getProperty(StanfordCoreNLP.NEWLINE_SPLITTER_PROPERTY, "false")); // ssplit.eolonly

    String hasSsplit = properties.getProperty("annotators");
    if (hasSsplit != null && hasSsplit.contains(StanfordCoreNLP.STANFORD_SSPLIT)) { // ssplit
      // Only possibly put in *NL* if not all one (parseBoolean treats null as false)
      if ( ! Boolean.parseBoolean(properties.getProperty("ssplit.isOneSentence"))) {
        // Set to { NEVER, ALWAYS, TWO_CONSECUTIVE } based on ssplit.newlineIsSentenceBreak
        String nlsbString = properties.getProperty(StanfordCoreNLP.NEWLINE_IS_SENTENCE_BREAK_PROPERTY,
                                                   StanfordCoreNLP.DEFAULT_NEWLINE_IS_SENTENCE_BREAK);
        WordToSentenceProcessor.NewlineIsSentenceBreak nlsb =
            WordToSentenceProcessor.stringToNewlineIsSentenceBreak(nlsbString);
        if (nlsb != WordToSentenceProcessor.NewlineIsSentenceBreak.NEVER) {
          keepNewline = true;
        }
      }
    }
    if (keepNewline) {
      // trailing comma is deliberate: initFactory joins this with the other options
      extraOptions = "tokenizeNLs,";
    }
    return extraOptions;
  }

  public TokenizerAnnotator(Properties properties) {
    this(false, properties, computeExtraOptions(properties));
  }

  public TokenizerAnnotator(boolean verbose) {
    this(verbose, TokenizerType.English);
  }

  public TokenizerAnnotator(String lang) {
    this(true, lang, null);
  }

  public TokenizerAnnotator(boolean verbose, TokenizerType lang) {
    this(verbose, lang.toString());
  }

  public TokenizerAnnotator(boolean verbose, String lang) {
    this(verbose, lang, null);
  }

  public TokenizerAnnotator(boolean verbose, String lang, String options) {
    this(verbose,
         lang == null ? null : PropertiesUtils.asProperties("tokenize.language", lang),
         options);
  }

  public TokenizerAnnotator(boolean verbose, Properties props) {
    this(verbose, props, null);
  }

  /**
   * Main constructor: sets up either a segmenter (for Arabic/Chinese) or a
   * TokenizerFactory for the language/class configured in the properties.
   *
   * @param verbose whether to log progress (may be overridden by tokenize.verbose)
   * @param props pipeline properties; if null, an empty Properties is used
   * @param options extra tokenizer options prepended to tokenize.options
   * @throws RuntimeException if tokenize.language is a segmenter language with
   *         no segmenter implementation
   */
  public TokenizerAnnotator(boolean verbose, Properties props, String options) {
    if (props == null) {
      props = new Properties();
    }

    // check if segmenting must be done; read the property once rather than per-branch
    String languageProp = props.getProperty("tokenize.language");
    if (languageProp != null && LanguageInfo.isSegmenterLanguage(languageProp)) {
      useSegmenter = true;
      LanguageInfo.HumanLanguage language = LanguageInfo.getLanguageFromString(languageProp);
      if (language == LanguageInfo.HumanLanguage.ARABIC) {
        segmenterAnnotator = new ArabicSegmenterAnnotator("segment", props);
      } else if (language == LanguageInfo.HumanLanguage.CHINESE) {
        segmenterAnnotator = new ChineseSegmenterAnnotator("segment", props);
      } else {
        // assignment is dead (the throw follows) but satisfies definite assignment of the final field
        segmenterAnnotator = null;
        throw new RuntimeException("No segmenter implemented for: " + language);
      }
    } else {
      useSegmenter = false;
      segmenterAnnotator = null;
    }

    VERBOSE = PropertiesUtils.getBool(props, "tokenize.verbose", verbose);
    TokenizerType type = TokenizerType.getTokenizerType(props);
    factory = initFactory(type, props, options);
  }

  /**
   * initFactory returns the right type of TokenizerFactory based on the options in the properties file
   * and the type. When adding a new Tokenizer, modify TokenizerType.getTokenizerType() to retrieve
   * your tokenizer from the properties file, and then add a case in the switch structure here to
   * instantiate the new Tokenizer type.
   *
   * @param type the TokenizerType
   * @param props the properties file
   * @param extraOptions extra things that should be passed into the tokenizer constructor
   * @return a TokenizerFactory for the requested type, or null for segmenter languages
   */
  private static TokenizerFactory<CoreLabel> initFactory(TokenizerType type, Properties props, String extraOptions)
      throws IllegalArgumentException {
    TokenizerFactory<CoreLabel> factory;
    String options = props.getProperty("tokenize.options", null);

    // set it to the equivalent of both extraOptions and options
    // TODO: maybe we should always have getDefaultOptions() and
    // expect the user to turn off default options.  That would
    // require all options to have negated options, but
    // currently there are some which don't have that
    if (options == null) {
      options = type.getDefaultOptions();
    }
    if (extraOptions != null) {
      if (extraOptions.endsWith(",")) {
        options = extraOptions + options;
      } else {
        options = extraOptions + ',' + options;
      }
    }

    switch (type) {
    case Arabic:
    case Chinese:
      // handled by a segmenter annotator, not a tokenizer factory
      factory = null;
      break;

    case Spanish:
      factory = SpanishTokenizer.factory(new CoreLabelTokenFactory(), options);
      break;

    case French:
      factory = FrenchTokenizer.factory(new CoreLabelTokenFactory(), options);
      break;

    case Whitespace:
      boolean eolIsSignificant = Boolean.parseBoolean(props.getProperty(EOL_PROPERTY, "false"));
      eolIsSignificant = eolIsSignificant ||
          Boolean.parseBoolean(props.getProperty(StanfordCoreNLP.NEWLINE_SPLITTER_PROPERTY, "false"));
      factory = new WhitespaceTokenizer.WhitespaceTokenizerFactory<>(new CoreLabelTokenFactory(), eolIsSignificant);
      break;

    case English:
    case German:
      factory = PTBTokenizer.factory(new CoreLabelTokenFactory(), options);
      break;

    case Unspecified:
      log.info("No tokenizer type provided. Defaulting to PTBTokenizer.");
      factory = PTBTokenizer.factory(new CoreLabelTokenFactory(), options);
      break;

    default:
      throw new IllegalArgumentException("No valid tokenizer type provided.\n" +
                                         "Use -tokenize.language, -tokenize.class, or -tokenize.whitespace \n" +
                                         "to specify a tokenizer.");
    }
    return factory;
  }

  /**
   * Returns a thread-safe tokenizer
   */
  public Tokenizer<CoreLabel> getTokenizer(Reader r) {
    return factory.getTokenizer(r);
  }

  /**
   * Does the actual work of splitting TextAnnotation into CoreLabels,
   * which are then attached to the TokensAnnotation.
   *
   * @throws RuntimeException if the annotation has no TextAnnotation
   */
  @Override
  public void annotate(Annotation annotation) {
    if (VERBOSE) {
      log.info("Tokenizing ... ");
    }

    // for Arabic and Chinese use a segmenter instead
    if (useSegmenter) {
      segmenterAnnotator.annotate(annotation);
      return;
    }

    if (annotation.containsKey(CoreAnnotations.TextAnnotation.class)) {
      String text = annotation.get(CoreAnnotations.TextAnnotation.class);
      // don't wrap in BufferedReader.  It gives you nothing for in-memory String unless you need the readLine() method!
      Reader r = new StringReader(text);

      List<CoreLabel> tokens = getTokenizer(r).tokenize();
      // cdm 2010-05-15: This is now unnecessary, as it is done in CoreLabelTokenFactory
      // for (CoreLabel token: tokens) {
      //   token.set(CoreAnnotations.TextAnnotation.class, token.get(CoreAnnotations.TextAnnotation.class));
      // }

      annotation.set(CoreAnnotations.TokensAnnotation.class, tokens);
      if (VERBOSE) {
        log.info("done.");
        log.info("Tokens: " + annotation.get(CoreAnnotations.TokensAnnotation.class));
      }
    } else {
      throw new RuntimeException("Tokenizer unable to find text in annotation: " + annotation);
    }
  }

  @Override
  public Set<Class<? extends CoreAnnotation>> requires() {
    return Collections.emptySet();
  }

  @Override
  public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
    return new HashSet<>(Arrays.asList(
        CoreAnnotations.TextAnnotation.class,
        CoreAnnotations.TokensAnnotation.class,
        CoreAnnotations.CharacterOffsetBeginAnnotation.class,
        CoreAnnotations.CharacterOffsetEndAnnotation.class,
        CoreAnnotations.BeforeAnnotation.class,
        CoreAnnotations.AfterAnnotation.class,
        CoreAnnotations.TokenBeginAnnotation.class,
        CoreAnnotations.TokenEndAnnotation.class,
        CoreAnnotations.PositionAnnotation.class,
        CoreAnnotations.IndexAnnotation.class,
        CoreAnnotations.OriginalTextAnnotation.class,
        CoreAnnotations.ValueAnnotation.class
    ));
  }

}