/*
* Copyright 2016
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
* <p>
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.frequency.phrasedetection;
import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathException;
import de.tudarmstadt.ukp.dkpro.core.api.io.sequencegenerator.PhraseSequenceGenerator;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.LexicalPhrase;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.TOP;
import org.apache.uima.resource.ResourceInitializationException;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import static org.apache.uima.fit.util.JCasUtil.select;
/**
* Annotate phrases in a sentence. Depending on the provided unigrams and the threshold, these
* comprise either one or two annotations (tokens, lemmas, ...).
* <p>
 * In order to identify longer phrases, run the {@link FrequencyCounter} and this annotator
 * multiple times, each time taking the results of the previous run as input. From the second
 * run on, set the feature path parameter {@link #PARAM_FEATURE_PATH} to phrases.
*/
public class PhraseAnnotator
    extends JCasAnnotator_ImplBase
{
    /**
     * The feature path to use for building bigrams. Default: tokens.
     */
    public static final String PARAM_FEATURE_PATH = "featurePath";
    @ConfigurationParameter(name = PARAM_FEATURE_PATH, mandatory = false)
    private String featurePath;

    private static final String DEFAULT_FEATURE_PATH = Token.class.getCanonicalName();

    /**
     * If true, lowercase everything.
     */
    // NOTE(review): the value "PARAM_LOWERCASE" is inconsistent with the camelCase values of
    // the other parameters (e.g. "featurePath"). Left unchanged because renaming it would break
    // existing descriptors that reference this parameter name.
    public static final String PARAM_LOWERCASE = "PARAM_LOWERCASE";
    @ConfigurationParameter(name = PARAM_LOWERCASE, mandatory = true, defaultValue = "false")
    private boolean lowercase;

    /**
     * The file providing the unigram and bigram counts to use.
     */
    public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION;
    @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = true)
    private String modelLocation;

    /**
     * The discount in order to prevent too many phrases consisting of very infrequent words to
     * be formed. A typical value is the minimum count set during model creation
     * ({@link FrequencyCounter#PARAM_MIN_COUNT}), which is by default set to 5.
     */
    public static final String PARAM_DISCOUNT = "discount";
    @ConfigurationParameter(name = PARAM_DISCOUNT, mandatory = true, defaultValue = "5")
    private int discount;

    /**
     * The threshold score for phrase construction. Default is 100. Lower values result in fewer
     * phrases. The value strongly depends on the size of the corpus and the token unigrams.
     */
    public static final String PARAM_THRESHOLD = "threshold";
    @ConfigurationParameter(name = PARAM_THRESHOLD, mandatory = true, defaultValue = "100")
    private float threshold;

    /**
     * Location of a stopwords file; passed through to the {@link PhraseSequenceGenerator}
     * builder. An empty string (the default) presumably disables stopword handling — confirm
     * against {@link PhraseSequenceGenerator}.
     */
    public static final String PARAM_STOPWORDS_FILE = "stopwordsFile";
    @ConfigurationParameter(name = PARAM_STOPWORDS_FILE, mandatory = true, defaultValue = "")
    private String stopwordsFile;

    /**
     * Replacement string for stopwords; passed through to the {@link PhraseSequenceGenerator}
     * builder.
     */
    public static final String PARAM_STOPWORDS_REPLACEMENT = "stopwordsReplacement";
    @ConfigurationParameter(name = PARAM_STOPWORDS_REPLACEMENT, mandatory = true, defaultValue = "")
    private String stopwordsReplacement;

    /**
     * Regular expression for filtering tokens; passed through to the
     * {@link PhraseSequenceGenerator} builder.
     */
    public static final String PARAM_FILTER_REGEX = "filterRegex";
    @ConfigurationParameter(name = PARAM_FILTER_REGEX, mandatory = true, defaultValue = "")
    private String filterRegex;

    /**
     * Replacement string for tokens matched by {@link #PARAM_FILTER_REGEX}; passed through to
     * the {@link PhraseSequenceGenerator} builder.
     */
    public static final String PARAM_REGEX_REPLACEMENT = "regexReplacement";
    @ConfigurationParameter(name = PARAM_REGEX_REPLACEMENT, mandatory = true, defaultValue = "")
    private String regexReplacement;

    /**
     * Set this parameter if bigrams should only be counted when occurring within a covering
     * type, e.g. sentences.
     */
    public static final String PARAM_COVERING_TYPE = "coveringType";
    @ConfigurationParameter(name = PARAM_COVERING_TYPE, mandatory = false)
    private String coveringType;

    /** Unigram counts read from the model file. */
    private Map<String, Integer> unigrams;
    /** Bigram counts read from the model file. */
    private Map<String, Integer> bigrams;
    /** Number of distinct unigrams in the model. */
    private int vocabularySize;
    private PhraseSequenceGenerator sequenceGenerator;

    @Override
    public void initialize(UimaContext context)
        throws ResourceInitializationException
    {
        super.initialize(context);

        /* set feature path to default */
        if (featurePath == null) {
            featurePath = DEFAULT_FEATURE_PATH;
        }

        try {
            sequenceGenerator = new PhraseSequenceGenerator.Builder()
                    .featurePath(featurePath)
                    .coveringType(coveringType)
                    .lowercase(lowercase)
                    .stopwordsFile(stopwordsFile)
                    .stopwordsReplacement(stopwordsReplacement)
                    .filterRegex(filterRegex)
                    .filterRegexReplacement(regexReplacement)
                    .build();
            readCounts();
        }
        catch (IOException e) {
            throw new ResourceInitializationException(e);
        }

        vocabularySize = unigrams.size();
        getLogger().info("Vocabulary size: " + vocabularySize);
    }

    @Override
    public void process(JCas aJCas)
        throws AnalysisEngineProcessException
    {
        List<LexicalPhrase[]> sequences;
        try {
            sequences = sequenceGenerator.tokenSequences(aJCas);
        }
        catch (FeaturePathException e) {
            throw new AnalysisEngineProcessException(e);
        }

        /* remove existing phrases */
        select(aJCas, LexicalPhrase.class).forEach(TOP::removeFromIndexes);

        for (LexicalPhrase[] sequence : sequences) {
            /* iterate over sequences in document */
            for (int i = 0; i < sequence.length; i++) {
                /* iterate over tokens within sequence */
                LexicalPhrase phrase1 = sequence[i];
                String token1 = phrase1.getText();
                LexicalPhrase newPhrase = phrase1;

                if (i < sequence.length - 1) {
                    /* do not look for bigram on last token */
                    LexicalPhrase phrase2 = sequence[i + 1];
                    String token2 = phrase2.getText();
                    String bigram = token1 + FrequencyCounter.BIGRAM_SEPARATOR + token2;

                    if (bigrams.containsKey(bigram)) {
                        assert unigrams.containsKey(token1);
                        assert unigrams.containsKey(token2);

                        /*
                         * Score: (count(bigram) - discount) * |V| / (count(w1) * count(w2)).
                         * Promote to double BEFORE multiplying: the previous int arithmetic
                         * silently wrapped on overflow for large corpora. Results are identical
                         * in all non-overflow cases.
                         */
                        double score =
                                (double) (bigrams.get(bigram) - discount) * vocabularySize /
                                ((double) unigrams.get(token1) * unigrams.get(token2));
                        getLogger().debug(bigram + "\t" + score);
                        if (score >= threshold) {
                            /* bigram phrase spanning two tokens found */
                            newPhrase = new LexicalPhrase(aJCas, phrase1.getBegin(),
                                    phrase2.getEnd());
                            newPhrase.setText(bigram);
                            i++; // skip succeeding token
                        }
                    }
                }
                newPhrase.addToIndexes(aJCas);
            }
        }
    }

    /**
     * Read the input file, adding unigrams and bigrams to the respective maps. The file lists
     * unigram counts first, then a separator line ({@link FrequencyCounter#NGRAM_SEPARATOR_LINE}),
     * then bigram counts; each count line has two columns separated by
     * {@link FrequencyCounter#COLUMN_SEPARATOR}.
     *
     * @throws IOException if the input file cannot be read
     * @throws IllegalStateException if the file is malformed or contains duplicate tokens
     */
    private void readCounts()
        throws IOException
    {
        unigrams = new HashMap<>();
        bigrams = new HashMap<>();
        getLogger().info("Reading frequencies from " + modelLocation);

        /*
         * try-with-resources closes the reader even when parsing throws (previously the stream
         * leaked on the exception paths). NOTE(review): no explicit charset is specified, so the
         * platform default is used — presumably matching what FrequencyCounter wrote; confirm
         * before switching to UTF-8.
         */
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(CompressionUtils
                .getInputStream(modelLocation, new FileInputStream(modelLocation))))) {
            boolean countingUnigrams = true;
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.equals(FrequencyCounter.NGRAM_SEPARATOR_LINE)) {
                    /* this should only happen once per file */
                    if (!countingUnigrams) {
                        throw new IllegalStateException(
                                "Error reading input file; contains multiple separation lines.");
                    }
                    countingUnigrams = false;
                }
                else {
                    String[] columns = line.split(FrequencyCounter.COLUMN_SEPARATOR);
                    if (columns.length != 2) {
                        throw new IllegalStateException("Invalid line in input file:\n" + line);
                    }
                    String token = columns[0];
                    int count = Integer.parseInt(columns[1]);
                    putUnique(countingUnigrams ? unigrams : bigrams, token, count);
                }
            }
        }
    }

    /**
     * Add a token count to a map, rejecting duplicate entries.
     *
     * @param map the target map (unigrams or bigrams)
     * @param token the token to add
     * @param count the token's count
     * @throws IllegalStateException if the map already contains the token
     */
    private static void putUnique(Map<String, Integer> map, String token, int count)
    {
        if (map.putIfAbsent(token, count) != null) {
            throw new IllegalStateException(
                    "Duplicate token in input file: '" + token + "'.");
        }
    }
}