/**
* Copyright 2007-2014
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package de.tudarmstadt.ukp.dkpro.core.corenlp;
import java.io.IOException;
import java.net.URL;
import java.util.Properties;
import java.util.Set;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.SegmenterBase;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetBeginAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetEndAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.TokenizerAnnotator;
import edu.stanford.nlp.pipeline.WordsToSentencesAnnotator;
import edu.stanford.nlp.process.WordToSentenceProcessor;
import edu.stanford.nlp.util.CoreMap;
/**
 * Tokenizer and sentence splitter using Stanford CoreNLP.
 * <p>
 * Tokens are produced by a {@link TokenizerAnnotator} and sentences by a
 * {@link WordsToSentencesAnnotator}. Sentence splitting works on the token sequence, so the
 * tokenizer is always run when sentences are requested, even if no token annotations are
 * written to the CAS.
 */
@TypeCapability(
        outputs = {
            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" })
public class CoreNlpSegmenter
    extends SegmenterBase
{
    // Passed to both CoreNLP annotators. It is not exposed as a configuration parameter and is
    // never assigned in this class, so it keeps its default value (false).
    private boolean verbose;

    /**
     * Regular expression matching the tokens that mark a sentence boundary. Defaults to
     * {@link WordToSentenceProcessor#DEFAULT_BOUNDARY_REGEX}.
     *
     * @see WordToSentenceProcessor#WordToSentenceProcessor
     */
    public static final String PARAM_BOUNDARY_TOKEN_REGEX = "boundaryTokenRegex";
    @ConfigurationParameter(name = PARAM_BOUNDARY_TOKEN_REGEX, mandatory = false, defaultValue = WordToSentenceProcessor.DEFAULT_BOUNDARY_REGEX)
    private String boundaryTokenRegex;

    /**
     * Optional pattern handed to the {@link WordsToSentencesAnnotator} as its multi-token
     * boundary regex. No default is set here.
     * <p>
     * NOTE(review): presumably a boundary pattern spanning several tokens - confirm the exact
     * pattern syntax against the CoreNLP documentation.
     */
    public static final String PARAM_BOUNDARY_MULTI_TOKEN_REGEX = "boundaryMultiTokenRegex";
    @ConfigurationParameter(name = PARAM_BOUNDARY_MULTI_TOKEN_REGEX, mandatory = false)
    private String boundaryMultiTokenRegex;

    /**
     * These are elements like "p" or "sent", which will be wrapped into regex for approximate XML
     * matching. They will be deleted in the output, and will always trigger a sentence boundary.
     */
    public static final String PARAM_HTML_ELEMENTS_TO_DISCARD = "htmlElementsToDiscard";
    @ConfigurationParameter(name = PARAM_HTML_ELEMENTS_TO_DISCARD, mandatory = false)
    private Set<String> htmlElementsToDiscard;

    /**
     * The set of regex for sentence boundary tokens that should be discarded.
     *
     * @see WordToSentenceProcessor#DEFAULT_SENTENCE_BOUNDARIES_TO_DISCARD
     */
    public static final String PARAM_BOUNDARIES_TO_DISCARD = "boundaryToDiscard";
    @ConfigurationParameter(name = PARAM_BOUNDARIES_TO_DISCARD, mandatory = false, defaultValue = {
            "\n", "*NL*" })
    private Set<String> boundaryToDiscard;

    /**
     * Strategy for treating newlines as sentence breaks. The default is "two".
     * <p>
     * NOTE(review): the value is passed through to CoreNLP unchanged; see the CoreNLP sentence
     * splitter documentation for the accepted values.
     */
    public static final String PARAM_NEWLINE_IS_SENTENCE_BREAK = "newlineIsSentenceBreak";
    @ConfigurationParameter(name = PARAM_NEWLINE_IS_SENTENCE_BREAK, mandatory = false, defaultValue = "two")
    private String newlineIsSentenceBreak;

    /**
     * The set of regex for tokens that should be discarded by the sentence splitter. Empty by
     * default.
     */
    public static final String PARAM_TOKEN_REGEXES_TO_DISCARD = "tokenRegexesToDiscard";
    @ConfigurationParameter(name = PARAM_TOKEN_REGEXES_TO_DISCARD, mandatory = false, defaultValue = {})
    private Set<String> tokenRegexesToDiscard;

    private ModelProviderBase<WordsToSentencesAnnotator> sentenceAnnotator;
    private ModelProviderBase<TokenizerAnnotator> tokenizerAnnotator;

    /**
     * Sets up the providers that build the CoreNLP tokenizer and sentence annotators.
     *
     * @param aContext
     *            the UIMA context of this component.
     * @throws ResourceInitializationException
     *             if the base class initialization fails.
     */
    @Override
    public void initialize(UimaContext aContext)
        throws ResourceInitializationException
    {
        super.initialize(aContext);

        tokenizerAnnotator = new ModelProviderBase<TokenizerAnnotator>(this, "corenlp", "tokenizer")
        {
            {
                // The tokenizer is built from properties only; the URL handed to
                // produceResource is not used.
                setDefault(LOCATION, NOT_REQUIRED);
            }

            @Override
            protected TokenizerAnnotator produceResource(URL aUrl)
                throws IOException
            {
                Properties props = getAggregatedProperties();

                // Only the language is forwarded. The remaining tokenizer properties
                // (tokenize.class, tokenize.whitespace, tokenize.options, tokenize.keepeol)
                // are deliberately left unset so that the CoreNLP defaults apply.
                Properties coreNlpProps = new Properties();
                coreNlpProps.setProperty("tokenize.language", props.getProperty(LANGUAGE));

                String extraOptions = null;
                return new TokenizerAnnotator(verbose, coreNlpProps, extraOptions);
            }
        };

        sentenceAnnotator = new ModelProviderBase<WordsToSentencesAnnotator>(this, "corenlp", "sentence")
        {
            {
                // The sentence splitter is configured from the component parameters only; the
                // URL handed to produceResource is not used.
                setDefault(LOCATION, NOT_REQUIRED);
            }

            @Override
            protected WordsToSentencesAnnotator produceResource(URL aUrl)
                throws IOException
            {
                return new WordsToSentencesAnnotator(verbose, boundaryTokenRegex,
                        boundaryToDiscard, htmlElementsToDiscard, newlineIsSentenceBreak,
                        boundaryMultiTokenRegex, tokenRegexesToDiscard);
            }
        };
    }

    /**
     * Tokenizes the given text and, depending on the configuration, adds token and/or sentence
     * annotations to the CAS.
     *
     * @param aJCas
     *            the CAS to which annotations are added.
     * @param aText
     *            the text of the zone being processed.
     * @param aZoneBegin
     *            offset added to the CoreNLP character offsets (which are relative to
     *            {@code aText}) when creating the annotations.
     * @throws AnalysisEngineProcessException
     *             declared by the base class contract.
     */
    @Override
    protected void process(JCas aJCas, String aText, int aZoneBegin)
        throws AnalysisEngineProcessException
    {
        boolean writeTokens = isWriteToken();
        boolean writeSentences = isWriteSentence();

        if (!writeTokens && !writeSentences) {
            // Nothing to produce, so there is no point in running CoreNLP at all.
            return;
        }

        Annotation document = new Annotation(aText);

        // Always tokenize: the sentence annotator operates on the tokens stored in the document
        // and cannot run on raw text, so tokenization must not depend on whether token
        // annotations are written.
        tokenizerAnnotator.configure(aJCas.getCas());
        tokenizerAnnotator.getResource().annotate(document);

        if (writeTokens) {
            for (CoreLabel token : document.get(CoreAnnotations.TokensAnnotation.class)) {
                createToken(aJCas,
                        token.get(CharacterOffsetBeginAnnotation.class) + aZoneBegin,
                        token.get(CharacterOffsetEndAnnotation.class) + aZoneBegin);
            }
        }

        if (writeSentences) {
            sentenceAnnotator.configure(aJCas.getCas());
            sentenceAnnotator.getResource().annotate(document);

            for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
                createSentence(aJCas,
                        sentence.get(CharacterOffsetBeginAnnotation.class) + aZoneBegin,
                        sentence.get(CharacterOffsetEndAnnotation.class) + aZoneBegin);
            }
        }
    }
}