/*******************************************************************************
* Copyright 2016
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
* <p>
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package de.tudarmstadt.ukp.dkpro.core.api.io.sequencegenerator;
import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathException;
import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathFactory;
import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathUtils;
import de.tudarmstadt.ukp.dkpro.core.api.io.TextUtils;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.LexicalPhrase;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;

import org.apache.uima.cas.Type;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.fit.util.CasUtil;
import org.apache.uima.jcas.JCas;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
* Generate sequences of phrases with optional stopword/regex-based filtering, and lowercasing.
* Filtered tokens are added as {@link LexicalPhrase}s with empty text or a replacement of the text,
* if {@link Builder#stopwordReplacement} and/or {@link Builder#filterRegexReplacement} were set.
* <p>
* Initialize with {@link Builder#build()}.
* <p>
* When strings instead of {@link LexicalPhrase}s should be output, use {@link Builder#buildStringSequenceGenerator()}.
*
* @since 1.9.0
*/
public class PhraseSequenceGenerator
{
    /** Replacement text emitted for whitespace characters in character-sequence mode. */
    public static final String WHITESPACE_CHAR_REPLACEMENT = "</s>";

    /** If true, all extracted texts are lowercased (default-locale semantics, see below). */
    private final boolean lowercase;

    /** Fully qualified name of the covering type (e.g. a sentence type), if any. */
    private final Optional<String> coveringTypeName;

    /** Replacement for tokens matching a filter regex; empty string removes the text. */
    private final String filterRegexReplacement;

    /**
     * Filter patterns, compiled once at construction time. Using pre-compiled {@link Pattern}s
     * instead of {@link String#matches(String)} avoids re-compiling every regex for every token.
     */
    private final Set<Pattern> filterPatterns;

    /** Replacement for stopword tokens; empty string removes the text. */
    private final String stopwordReplacement;

    /** Stopword list read from the configured URL; empty if no stopword file was set. */
    private final Collection<String> stopwords;

    /** Feature path from which token texts are extracted. */
    private final String featurePath;

    /** Tokens shorter than this are replaced by the empty string. */
    private final int minTokenLength;

    /** If true, sequences are generated character-by-character instead of per annotation. */
    private final boolean useCharacters;

    /**
     * Initialize from a {@link Builder}; use {@link Builder#build()}.
     *
     * @param builder the {@link Builder} holding the configuration
     * @throws IOException if a stopwords file is configured but cannot be read
     */
    private PhraseSequenceGenerator(Builder builder)
            throws IOException
    {
        this.lowercase = builder.lowercase;
        this.coveringTypeName = builder.coveringType;
        this.minTokenLength = builder.minTokenLength;
        this.featurePath = builder.featurePath;
        stopwords = builder.stopwordsFile.isPresent()
                ? TextUtils.readStopwordsURL(builder.stopwordsFile.get(), lowercase)
                : Collections.emptySet();
        this.stopwordReplacement = builder.stopwordsReplacement;

        /* compile the filter regexes once so annotationSequence() does not re-compile per token */
        this.filterPatterns = new HashSet<>();
        for (String filterRegex : builder.filterRegexes) {
            filterPatterns.add(Pattern.compile(filterRegex));
        }

        this.filterRegexReplacement = builder.filterRegexReplacement;
        this.useCharacters = builder.characters;
    }

    /**
     * Generate a list of {@link LexicalPhrase} sequences where each list element represents phrases
     * extracted from the covering types, e.g. a sentence. If no covering type was defined, the list
     * contains one element representing the whole document.
     *
     * @param aJCas
     *            a {@link JCas}
     * @return a list of {@link LexicalPhrase} arrays
     * @throws FeaturePathException
     *             if there was a problem creating the feature path.
     */
    public List<LexicalPhrase[]> tokenSequences(JCas aJCas)
            throws FeaturePathException
    {
        return useCharacters ? characterSequences(aJCas) : annotationSequences(aJCas);
    }

    /**
     * Extract a list of {@link LexicalPhrase} arrays from the {@link JCas}, one phrase per
     * annotation selected by the feature path.
     * <p>
     * If {@link #coveringTypeName} is set, a dedicated array for each covering annotation is
     * extracted. Otherwise, the result contains only one element.
     *
     * @param aJCas
     *            a {@link JCas}
     * @return a list of {@link LexicalPhrase} arrays
     * @throws FeaturePathException
     *             if there was a problem creating the feature path.
     */
    private List<LexicalPhrase[]> annotationSequences(JCas aJCas)
            throws FeaturePathException
    {
        List<LexicalPhrase[]> phrases = new ArrayList<>();

        if (coveringTypeName.isPresent()) {
            Type coveringType = FeaturePathUtils
                    .getType(aJCas.getTypeSystem(), coveringTypeName.get());

            /* iterate over covering annotations */
            for (AnnotationFS covering : CasUtil.select(aJCas.getCas(), coveringType)) {
                phrases.add(annotationSequence(aJCas, Optional.of(covering)));
            }
        }
        else {
            /* add a single token sequence for the whole document */
            phrases.add(annotationSequence(aJCas, Optional.empty()));
        }
        return phrases;
    }

    /**
     * Extract a list of character-based {@link LexicalPhrase} arrays from the {@link JCas}.
     * <p>
     * If {@link #coveringTypeName} is set, a dedicated array for each covering annotation is
     * extracted. Otherwise, the result contains only one element covering the document text.
     *
     * @param aJCas
     *            a {@link JCas}
     * @return a list of {@link LexicalPhrase} arrays
     * @throws FeaturePathException
     *             if there was a problem creating the feature path.
     */
    private List<LexicalPhrase[]> characterSequences(JCas aJCas)
            throws FeaturePathException
    {
        if (coveringTypeName.isPresent()) {
            Type coveringType = FeaturePathUtils
                    .getType(aJCas.getTypeSystem(), coveringTypeName.get());
            return CasUtil.select(aJCas.getCas(), coveringType).stream()
                    .map(covering -> characterSequence(aJCas, covering.getCoveredText(),
                            covering.getBegin()))
                    .collect(Collectors.toList());
        }
        else {
            return Collections.singletonList(characterSequence(aJCas, aJCas.getDocumentText(), 0));
        }
    }

    /**
     * Generate an array of {@link LexicalPhrase}s from features (e.g. tokens or lemmas) covered by
     * an annotation (e.g. a sentence). If no coveringAnnotation is set, return all features in the
     * CAS.
     * <p>
     * Optionally, the tokens are filtered by stopwords and/or regular expressions. In matching
     * elements, the phrase texts are replaced according to {@link Builder#stopwordReplacement} and
     * {@link Builder#filterRegexReplacement}.
     *
     * @param aJCas
     *            a {@link JCas}
     * @param coveringAnnotation
     *            an Optional covering annotation from which tokens are selected, e.g. a
     *            {@link Sentence}
     * @return an array of {@link LexicalPhrase}s representing all extracted tokens
     * @throws FeaturePathException
     *             if the annotation type specified in PARAM_TOKEN_FEATURE_PATH cannot be extracted.
     */
    private LexicalPhrase[] annotationSequence(JCas aJCas,
            Optional<AnnotationFS> coveringAnnotation)
            throws FeaturePathException
    {
        List<LexicalPhrase> sequence = new ArrayList<>();

        FeaturePathFactory.FeaturePathIterator<AnnotationFS> valueIterator =
                FeaturePathUtils.featurePathIterator(aJCas, featurePath, coveringAnnotation);

        /* iterate over tokens (optionally within covering annotation) */
        while (valueIterator.hasNext()) {
            Map.Entry<AnnotationFS, String> entry = valueIterator.next();
            AnnotationFS annotation = entry.getKey();
            LexicalPhrase phrase = new LexicalPhrase(aJCas, annotation.getBegin(),
                    annotation.getEnd());

            String text = entry.getValue();

            /*
             * transform text; the order matters: length filter on the original text, then
             * lowercasing, then stopword lookup (the stopword list was read with the same
             * lowercase flag), then regex filters
             */
            text = text.length() < minTokenLength ? "" : text;
            // NOTE(review): default-locale lowercasing, kept for consistency with
            // TextUtils.readStopwordsURL(url, lowercase) — confirm before switching to Locale.ROOT
            text = lowercase ? text.toLowerCase() : text;
            text = stopwords.contains(text) ? stopwordReplacement : text;
            for (Pattern filterPattern : filterPatterns) {
                if (filterPattern.matcher(text).matches()) {
                    text = filterRegexReplacement;
                }
            }
            phrase.setText(text);
            sequence.add(phrase);
        }
        return sequence.toArray(new LexicalPhrase[0]);
    }

    /**
     * Generate a sequence of {@link LexicalPhrase}s based on characters.
     * <p>
     * Whitespaces are replaced by {@link #WHITESPACE_CHAR_REPLACEMENT}. All characters that are
     * neither alphabetic, digits, or whitespace are omitted.
     *
     * @param aJCas the {@link JCas}
     * @param text the text to extract characters from
     * @param begin the begin of the first {@link LexicalPhrase} annotation
     * @return an array of {@link LexicalPhrase}s
     */
    private LexicalPhrase[] characterSequence(JCas aJCas, String text, int begin)
    {
        List<LexicalPhrase> sequence = new ArrayList<>();
        // NOTE(review): iterates char-by-char, so supplementary code points (outside the BMP)
        // are seen as two surrogate chars — confirm whether such input can occur here
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            if (Character.isAlphabetic(c) || Character.isDigit(c) || Character
                    .isWhitespace(c)) {
                String s = Character.isWhitespace(c) ?
                        WHITESPACE_CHAR_REPLACEMENT :
                        String.valueOf(c);
                if (lowercase) {
                    s = s.toLowerCase();
                }

                /* each phrase covers exactly one character of the original text */
                LexicalPhrase phrase = new LexicalPhrase(aJCas, begin + i, begin + i + 1);
                phrase.setText(s);
                sequence.add(phrase);
            }
        }
        return sequence.toArray(new LexicalPhrase[0]);
    }

    /**
     * Builder for {@link PhraseSequenceGenerator}s.
     * <p>
     * Alternatively constructs a {@link StringSequenceGenerator} with
     * {@link #buildStringSequenceGenerator()}.
     */
    public static class Builder
    {
        private boolean lowercase = false;
        private Optional<String> coveringType = Optional.empty();
        private int minTokenLength = 0;
        private Optional<URL> stopwordsFile = Optional.empty();
        private String stopwordsReplacement = "";
        private String featurePath = Token.class.getCanonicalName();
        @SuppressWarnings("SpellCheckingInspection") private Set<String> filterRegexes = new HashSet<>();
        private String filterRegexReplacement = "";
        private boolean characters = false;

        /**
         * @param featurePath set the feature path to use for creating token sequences.
         * @return a {@link Builder}
         */
        public Builder featurePath(String featurePath)
        {
            this.featurePath = featurePath;
            return this;
        }

        /**
         * @param stopwordsFile path to a stopwords file; an empty string unsets the file
         * @return a {@link Builder}
         * @throws MalformedURLException if the path cannot be converted to a URL
         */
        public Builder stopwordsFile(String stopwordsFile)
                throws MalformedURLException
        {
            if (stopwordsFile.isEmpty()) {
                this.stopwordsFile = Optional.empty();
                return this;
            } else {
                return stopwordsFile(new File(stopwordsFile));
            }
        }

        /**
         * @param stopwordsFile a stopwords {@link File}
         * @return a {@link Builder}
         * @throws MalformedURLException if the file cannot be converted to a URL
         */
        public Builder stopwordsFile(File stopwordsFile)
                throws MalformedURLException
        {
            URL url = stopwordsFile.toURI().toURL();
            return stopwordsURL(url);
        }

        /**
         * @param stopwordsURL set the location of the stopwords file
         * @return a {@link Builder}
         */
        public Builder stopwordsURL(URL stopwordsURL)
        {
            this.stopwordsFile = Optional.of(stopwordsURL);
            return this;
        }

        /**
         * @param stopwordsReplacement stopwords are replaced by this string or removed if replacement string is empty
         * @return a {@link Builder}
         */
        public Builder stopwordsReplacement(String stopwordsReplacement)
        {
            this.stopwordsReplacement = stopwordsReplacement == null ? "" : stopwordsReplacement;
            return this;
        }

        /**
         * @param minTokenLength tokens shorter than the given length are filtered out
         * @return a {@link Builder}
         */
        @SuppressWarnings("unused") public Builder minTokenLength(int minTokenLength)
        {
            this.minTokenLength = minTokenLength;
            return this;
        }

        /**
         * This method can be called multiple times in order to add multiple regular expressions for filtering.
         * If a token matches any of the regular expression, it is omitted.
         *
         * @param filterRegex Tokens matching this regular expression are filtered out.
         * @return a {@link Builder}
         */
        public Builder filterRegex(String filterRegex)
        {
            if (!filterRegex.isEmpty()) {
                this.filterRegexes.add(filterRegex);
            }
            return this;
        }

        /**
         * @param filterRegexReplacement tokens matching the {@link #filterRegexes} are replaced by this string. If this is empty, these tokens are removed.
         * @return a {@link Builder}
         */
        public Builder filterRegexReplacement(String filterRegexReplacement)
        {
            this.filterRegexReplacement =
                    filterRegexReplacement == null ? "" : filterRegexReplacement;
            return this;
        }

        /**
         * @param lowercase If true, all tokens are lowercased
         * @return a {@link Builder}
         */
        public Builder lowercase(boolean lowercase)
        {
            this.lowercase = lowercase;
            return this;
        }

        /**
         * @param coveringType if set, a separate string sequence is generated for each sequence covered
         *                     by the covering type, e.g. one sequence for each sentence.
         * @return a {@link Builder}
         */
        public Builder coveringType(String coveringType)
        {
            this.coveringType = coveringType == null || coveringType.isEmpty() ?
                    Optional.empty() :
                    Optional.of(coveringType);
            return this;
        }

        /**
         * If set to true, the generated phrases contain characters instead of tokens or other annotations.
         *
         * @param characters a boolean
         * @return a {@link Builder}
         */
        public Builder characters(boolean characters)
        {
            this.characters = characters;
            return this;
        }

        /**
         * Generate a {@link PhraseSequenceGenerator}
         *
         * @return a {@link PhraseSequenceGenerator} instance
         * @throws IOException if a stopwords file is specified but cannot be read
         */
        public PhraseSequenceGenerator build()
                throws IOException
        {
            return new PhraseSequenceGenerator(this);
        }

        /**
         * Generate a {@link StringSequenceGenerator} that directly returns Strings
         * instead of {@link LexicalPhrase}s.
         *
         * @return a {@link StringSequenceGenerator} instance
         * @throws IOException if a stopwords file is specified but cannot be read
         */
        public StringSequenceGenerator buildStringSequenceGenerator()
                throws IOException
        {
            return new StringSequenceGenerator(this);
        }
    }
}