/**
* Copyright 2007-2014
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package de.tudarmstadt.ukp.dkpro.core.stanfordnlp;
import static org.apache.uima.fit.util.JCasUtil.selectCovered;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.LanguageCapability;
import org.apache.uima.jcas.JCas;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.Messages;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.SegmenterBase;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import edu.stanford.nlp.international.arabic.process.ArabicTokenizer;
import edu.stanford.nlp.international.french.process.FrenchTokenizer;
import edu.stanford.nlp.international.spanish.process.SpanishTokenizer;
import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetBeginAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetEndAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBEscapingProcessor;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.WordToSentenceProcessor;
import edu.stanford.nlp.process.WordToSentenceProcessor.NewlineIsSentenceBreak;
/**
* Stanford sentence splitter and tokenizer.
*/
@LanguageCapability({"en", "es", "fr"})
public class StanfordSegmenter
extends SegmenterBase
{
private static final Map<String, InternalTokenizerFactory> tokenizerFactories;
// private static final Map<String, TreebankLanguagePack> languagePacks;
static {
tokenizerFactories = new HashMap<String, InternalTokenizerFactory>();
// tokenizerFactories.put("ar", new InternalArabicTokenizerFactory());
tokenizerFactories.put("en", new InternalPTBTokenizerFactory());
tokenizerFactories.put("es", new InternalSpanishTokenizerFactory());
tokenizerFactories.put("fr", new InternalFrenchTokenizerFactory());
// The Negra tokenizer is not really a full tokenizer.
// tokenizerFactories.put("de", new InternalNegraPennTokenizerFactory());
// Not sure if those really work - don't know how to test
// tokenizerFactories.put("zh", new InternalCHTBTokenizerFactory());
// languagePacks = new HashMap<String, TreebankLanguagePack>();
// languagePacks.put("en", new PennTreebankLanguagePack());
// languagePacks.put("zh", new ChineseTreebankLanguagePack());
// languagePacks.put("en", new ArabicTreebankLanguagePack());
// languagePacks.put("de", new NegraPennLanguagePack());
}
/**
* If this component is not configured for a specific language and if the language stored in
* the document metadata is not supported, use the given language as a fallback.
*/
public static final String PARAM_LANGUAGE_FALLBACK = "languageFallback";
@ConfigurationParameter(name = PARAM_LANGUAGE_FALLBACK, mandatory = false)
private String languageFallback;
/**
* The set of boundary tokens. If null, use default.
*
* @see WordToSentenceProcessor#WordToSentenceProcessor
*/
public static final String PARAM_BOUNDARY_TOKEN_REGEX = "boundaryTokenRegex";
@ConfigurationParameter(name = PARAM_BOUNDARY_TOKEN_REGEX, mandatory = false, defaultValue = WordToSentenceProcessor.DEFAULT_BOUNDARY_REGEX)
private String boundaryTokenRegex;
/**
* This is a Set of String that are matched with .equals() which are allowed to be tacked onto
* the end of a sentence after a sentence boundary token, for example ")".
*
* @see WordToSentenceProcessor#DEFAULT_BOUNDARY_FOLLOWERS_REGEX
*/
public static final String PARAM_BOUNDARY_FOLLOWERS_REGEX = "boundaryFollowersRegex";
@ConfigurationParameter(name = PARAM_BOUNDARY_FOLLOWERS_REGEX, mandatory = false, defaultValue =
WordToSentenceProcessor.DEFAULT_BOUNDARY_FOLLOWERS_REGEX)
private String boundaryFollowersRegex;
/**
* These are elements like "p" or "sent", which will be wrapped into regex for approximate XML
* matching. They will be deleted in the output, and will always trigger a sentence boundary.
*/
public static final String PARAM_XML_BREAK_ELEMENTS_TO_DISCARD = "xmlBreakElementsToDiscard";
@ConfigurationParameter(name = PARAM_XML_BREAK_ELEMENTS_TO_DISCARD, mandatory = false)
private Set<String> xmlBreakElementsToDiscard;
/**
* The set of regex for sentence boundary tokens that should be discarded.
*
* @see WordToSentenceProcessor#DEFAULT_SENTENCE_BOUNDARIES_TO_DISCARD
*/
public static final String PARAM_BOUNDARIES_TO_DISCARD = "boundaryToDiscard";
@ConfigurationParameter(name = PARAM_BOUNDARIES_TO_DISCARD, mandatory = false, defaultValue = {
"\n", "*NL*" })
private Set<String> boundariesToDiscard;
/**
* A regular expression for element names containing a sentence region. Only tokens in such
* elements will be included in sentences. The start and end tags themselves are not included in
* the sentence.
*/
public static final String PARAM_REGION_ELEMENT_REGEX = "regionElementRegex";
@ConfigurationParameter(name = PARAM_REGION_ELEMENT_REGEX, mandatory = false)
private String regionElementRegex;
/**
* Strategy for treating newlines as paragraph breaks.
*/
public static final String PARAM_NEWLINE_IS_SENTENCE_BREAK = "newlineIsSentenceBreak";
@ConfigurationParameter(name = PARAM_NEWLINE_IS_SENTENCE_BREAK, mandatory = false, defaultValue = "TWO_CONSECUTIVE")
private NewlineIsSentenceBreak newlineIsSentenceBreak;
/**
* The set of regex for sentence boundary tokens that should be discarded.
*/
public static final String PARAM_TOKEN_REGEXES_TO_DISCARD = "tokenRegexesToDiscard";
@ConfigurationParameter(name = PARAM_TOKEN_REGEXES_TO_DISCARD, mandatory = false, defaultValue = {})
private Set<String> tokenRegexesToDiscard;
/**
* Whether to treat all input as one sentence.
*/
public static final String PARAM_IS_ONE_SENTENCE = "isOneSentence";
@ConfigurationParameter(name = PARAM_IS_ONE_SENTENCE, mandatory = true, defaultValue = "false")
private boolean isOneSentence;
/**
* Whether to generate empty sentences.
*/
public static final String PARAM_ALLOW_EMPTY_SENTENCES = "allowEmptySentences";
@ConfigurationParameter(name = PARAM_ALLOW_EMPTY_SENTENCES, mandatory = true, defaultValue = "false")
private boolean allowEmptySentences;
/**
* Additional options that should be passed to the tokenizers. The available options depend on
* the language-specific tokenizer being used.
*/
private String[] additionalOptions;
@Override
protected void process(JCas aJCas, String aText, int aZoneBegin)
throws AnalysisEngineProcessException
{
List<Token> casTokens = null;
// Use value from language parameter, document language or fallback language - whatever
// is available
String language = getLanguage(aJCas);
if (isWriteToken()) {
casTokens = new ArrayList<Token>();
final Tokenizer<?> tokenizer = getTokenizer(language, aText);
List<?> tokens = tokenizer.tokenize();
for (int i = 0; i < tokens.size(); i++) {
final Object token = tokens.get(i);
// System.out.println("Token class: "+token.getClass());
CoreLabel l = (CoreLabel) token;
String t = l.word();
int begin = l.get(CharacterOffsetBeginAnnotation.class);
int end = l.get(CharacterOffsetEndAnnotation.class);
casTokens.add(createToken(aJCas, t, aZoneBegin + begin, aZoneBegin + end));
}
}
if (isWriteSentence()) {
if (casTokens == null) {
casTokens = selectCovered(aJCas, Token.class, aZoneBegin,
aZoneBegin + aText.length());
}
// Prepare the tokens for processing by WordToSentenceProcessor
List<CoreLabel> tokensInDocument = new ArrayList<CoreLabel>();
Pattern nlPattern = Pattern.compile(".*(\r\n|\n|\r).*");
Matcher nlMatcher = nlPattern.matcher("");
int lastTokenEnd = 0;
for (Token token : casTokens) {
if (!NewlineIsSentenceBreak.NEVER.equals(newlineIsSentenceBreak)) {
// add newline as token for newlineIsSentenceBreak parameter
nlMatcher.reset(aJCas.getDocumentText().subSequence(lastTokenEnd, token.getBegin()));
if (nlMatcher.matches()) {
CoreLabel l = new CoreLabel();
l.set(CharacterOffsetBeginAnnotation.class, lastTokenEnd + nlMatcher.start(1));
l.set(CharacterOffsetEndAnnotation.class, lastTokenEnd + nlMatcher.end(1));
l.setWord("\n");
tokensInDocument.add(l);
}
}
lastTokenEnd = token.getEnd();
// add regular token
CoreLabel l = new CoreLabel();
l.set(CharacterOffsetBeginAnnotation.class, token.getBegin());
l.set(CharacterOffsetEndAnnotation.class, token.getEnd());
l.setWord(token.getCoveredText());
tokensInDocument.add(l);
}
// The sentence splitter (probably) requires the escaped text, so we prepare it here
PTBEscapingProcessor escaper = new PTBEscapingProcessor();
escaper.apply(tokensInDocument);
// Apply the WordToSentenceProcessor to find the sentence boundaries
WordToSentenceProcessor<CoreLabel> proc = new WordToSentenceProcessor<CoreLabel>(
boundaryTokenRegex, boundaryFollowersRegex, boundariesToDiscard,
xmlBreakElementsToDiscard, regionElementRegex, newlineIsSentenceBreak, null,
tokenRegexesToDiscard, isOneSentence, allowEmptySentences);
List<List<CoreLabel>> sentencesInDocument = proc.process(tokensInDocument);
for (List<CoreLabel> sentence : sentencesInDocument) {
int begin = sentence.get(0).get(CharacterOffsetBeginAnnotation.class);
int end = sentence.get(sentence.size()-1).get(CharacterOffsetEndAnnotation.class);
createSentence(aJCas, begin, end);
}
}
}
private
Tokenizer getTokenizer(
final String aLanguage,
final String aText) throws AnalysisEngineProcessException
{
InternalTokenizerFactory tk = tokenizerFactories.get(aLanguage);
if (tk == null) {
if (languageFallback == null) {
throw new AnalysisEngineProcessException(Messages.BUNDLE,
Messages.ERR_UNSUPPORTED_LANGUAGE, new String[] { aLanguage });
}
else {
tk = tokenizerFactories.get(languageFallback);
if (tk == null) {
throw new AnalysisEngineProcessException(Messages.BUNDLE,
Messages.ERR_UNSUPPORTED_LANGUAGE, new String[] { languageFallback });
}
}
}
return tk.create(aText);
}
private static
interface InternalTokenizerFactory
{
Tokenizer<?> create(String s);
}
private static
class InternalPTBTokenizerFactory
implements InternalTokenizerFactory
{
@Override
public
Tokenizer<?> create(
final String s)
{
// TokenizerFactory<CoreLabel> f = PTBTokenizer.factory(new CoreLabelTokenFactory(), "invertible,ptb3Escaping=false");
return new PTBTokenizer<CoreLabel>(new StringReader(s),new CoreLabelTokenFactory(),"invertible");
}
}
// The InternalNegraPennTokenizer is not meant for German text. It
// is for parsing a particular corpus format.
// private static
// class InternalNegraPennTokenizerFactory
// implements InternalTokenizerFactory
// {
// @Override
// public
// Tokenizer<?> create(
// final String s)
// {
// return new NegraPennTokenizer(new StringReader(s));
// }
// }
private static
class InternalArabicTokenizerFactory
implements InternalTokenizerFactory
{
@Override
public
Tokenizer<?> create(
final String s)
{
return ArabicTokenizer.newArabicTokenizer(new StringReader(s), new Properties());
}
}
private static
class InternalFrenchTokenizerFactory
implements InternalTokenizerFactory
{
@Override
public
Tokenizer<?> create(
final String s)
{
return FrenchTokenizer.factory().getTokenizer(new StringReader(s), "tokenizeNLs=false");
}
}
private static
class InternalSpanishTokenizerFactory
implements InternalTokenizerFactory
{
@Override
public
Tokenizer<?> create(
final String s)
{
return SpanishTokenizer.factory(new CoreLabelTokenFactory(), null).getTokenizer(
new StringReader(s));
}
}
// While the stanford parser should come with a proper tokenizer for
// Chinese (because it can parse chinese text), this does not seem to be
// the right one or I am using it wrong. The associated test cases do not
// work.
// private static
// class InternalCHTBTokenizerFactory
// implements InternalTokenizerFactory
// {
// @Override
// public
// Tokenizer<?> create(
// final String s)
// {
// return new CHTBTokenizer(new StringReader(s));
// }
// }
}