/*
* Copyright 2016
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
* <p>
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.frequency.phrasedetection;
import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathException;
import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase;
import de.tudarmstadt.ukp.dkpro.core.api.io.sequencegenerator.PhraseSequenceGenerator;
import de.tudarmstadt.ukp.dkpro.core.api.io.sequencegenerator.StringSequenceGenerator;
import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import org.apache.commons.collections4.Bag;
import org.apache.commons.collections4.bag.HashBag;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.UncheckedIOException;
import java.nio.charset.StandardCharsets;
import java.util.regex.Pattern;
import java.util.stream.Stream;
/**
 * Count unigrams and bigrams in a collection.
 * <p>
 * Counts are accumulated in memory over all processed CASes. When the collection has been
 * processed, they are written to the target location: first the unigrams, then
 * {@link #NGRAM_SEPARATOR_LINE}, then the bigrams. Every n-gram line has the form
 * {@code ngram<TAB>count} and the output is encoded as UTF-8.
 */
public class FrequencyCounter
    extends JCasFileWriter_ImplBase
{
    /**
     * When concatenating multiple tokens, this string is inserted between them.
     */
    static final String BIGRAM_SEPARATOR = " ";

    /**
     * Columns (i.e. tokens and counts) are separated by this character.
     */
    static final String COLUMN_SEPARATOR = "\t";

    /**
     * When hitting a column separator within a token, it is replaced by this token.
     */
    static final String COLUMN_SEP_REPLACEMENT = " ";

    /**
     * This string (a line) will separate unigrams from bigrams in the output file.
     */
    static final String NGRAM_SEPARATOR_LINE = "----------------------------------------------------";

    /**
     * Regular expression matching a line break (CR LF, CR, or LF) inside a token.
     */
    static final String NEWLINE_REGEX = "\r\n?|\n";

    /**
     * Precompiled form of {@link #NEWLINE_REGEX}, so the expression is not recompiled per token.
     */
    private static final Pattern NEWLINE_PATTERN = Pattern.compile(NEWLINE_REGEX);

    /**
     * The feature path. Default: tokens.
     */
    public static final String PARAM_FEATURE_PATH = "featurePath";
    @ConfigurationParameter(name = PARAM_FEATURE_PATH, mandatory = false)
    private String featurePath;
    private static final String DEFAULT_FEATURE_PATH = Token.class.getCanonicalName();

    /**
     * Set this parameter if bigrams should only be counted when occurring within a covering type,
     * e.g. sentences.
     */
    public static final String PARAM_COVERING_TYPE = "coveringType";
    @ConfigurationParameter(name = PARAM_COVERING_TYPE, mandatory = false)
    private String coveringType;

    /**
     * If true, all tokens are lowercased.
     */
    public static final String PARAM_LOWERCASE = "lowercase";
    @ConfigurationParameter(name = PARAM_LOWERCASE, mandatory = true, defaultValue = "false")
    private boolean lowercase;

    /**
     * Tokens occurring fewer times than this value are omitted. Default: 5.
     */
    public static final String PARAM_MIN_COUNT = "minCount";
    @ConfigurationParameter(name = PARAM_MIN_COUNT, mandatory = true, defaultValue = "5")
    private int minCount;

    /**
     * If true, sort output by count (descending order).
     */
    public static final String PARAM_SORT_BY_COUNT = "sortByCount";
    @ConfigurationParameter(name = PARAM_SORT_BY_COUNT, mandatory = true, defaultValue = "false")
    private boolean sortByCount;

    /**
     * If true, sort output alphabetically.
     */
    public static final String PARAM_SORT_BY_ALPHABET = "sortByAlphabet";
    @ConfigurationParameter(name = PARAM_SORT_BY_ALPHABET, mandatory = true, defaultValue = "false")
    private boolean sortByAlphabet;

    /**
     * Stopwords file setting; handed unchanged to the sequence generator builder. Default: empty.
     * NOTE(review): presumably the location of a stopword list - confirm against
     * PhraseSequenceGenerator.
     */
    public static final String PARAM_STOPWORDS_FILE = "stopwordsFile";
    @ConfigurationParameter(name = PARAM_STOPWORDS_FILE, mandatory = true, defaultValue = "")
    private String stopwordsFile;

    /**
     * Stopwords replacement setting; handed unchanged to the sequence generator builder.
     * Default: empty.
     */
    public static final String PARAM_STOPWORDS_REPLACEMENT = "stopwordsReplacement";
    @ConfigurationParameter(name = PARAM_STOPWORDS_REPLACEMENT, mandatory = true, defaultValue = "")
    private String stopwordsReplacement;

    /**
     * Filter regular expression; handed unchanged to the sequence generator builder.
     * Default: empty.
     */
    public static final String PARAM_FILTER_REGEX = "filterRegex";
    @ConfigurationParameter(name = PARAM_FILTER_REGEX, mandatory = true, defaultValue = "")
    private String filterRegex;

    /**
     * Replacement used together with {@link #PARAM_FILTER_REGEX}; handed unchanged to the
     * sequence generator builder. Default: empty.
     */
    public static final String PARAM_REGEX_REPLACEMENT = "regexReplacement";
    @ConfigurationParameter(name = PARAM_REGEX_REPLACEMENT, mandatory = true, defaultValue = "")
    private String regexReplacement;

    /** Unigram counts accumulated over all processed CASes. */
    private Bag<String> unigrams;

    /** Bigram counts accumulated over all processed CASes. */
    private Bag<String> bigrams;

    /** Produces the token sequences (one array per sequence) that are counted. */
    private StringSequenceGenerator sequenceGenerator;

    /**
     * Validates the sorting options, resets the counters and builds the sequence generator from
     * the configuration parameters.
     *
     * @param context the UIMA context
     * @throws ResourceInitializationException if both sorting options are enabled at once, or if
     *                                         building the sequence generator fails with an
     *                                         {@link IOException}
     */
    @Override
    public void initialize(UimaContext context)
        throws ResourceInitializationException
    {
        super.initialize(context);

        if (sortByAlphabet && sortByCount) {
            throw new ResourceInitializationException(new IllegalArgumentException(
                    "Can only sort either by count or alphabetically."));
        }

        unigrams = new HashBag<>();
        bigrams = new HashBag<>();

        /* set feature path to default */
        if (featurePath == null) {
            featurePath = DEFAULT_FEATURE_PATH;
        }

        /* init sequence generator */
        try {
            sequenceGenerator = new PhraseSequenceGenerator.Builder()
                    .featurePath(featurePath)
                    .coveringType(coveringType)
                    .lowercase(lowercase)
                    .stopwordsFile(stopwordsFile)
                    .stopwordsReplacement(stopwordsReplacement)
                    .filterRegex(filterRegex)
                    .filterRegexReplacement(regexReplacement)
                    .buildStringSequenceGenerator();
        }
        catch (IOException e) {
            throw new ResourceInitializationException(e);
        }
    }

    /**
     * Adds the unigrams and bigrams of one CAS to the counters. Bigrams are built only from
     * tokens that are adjacent within the same sequence.
     *
     * @param aJCas the CAS to count n-grams in
     * @throws AnalysisEngineProcessException if the token sequences cannot be generated
     */
    @Override
    public void process(JCas aJCas)
        throws AnalysisEngineProcessException
    {
        try {
            /* iterate over sequences (e.g. sentences) */
            for (String[] sequence : sequenceGenerator.tokenSequences(aJCas)) {
                /* the sanitized predecessor within this sequence; null at the sequence start */
                String previous = null;
                for (String token : sequence) {
                    /* sanitize every token exactly once and reuse it for the bigram */
                    String unigram = sanitize(token);
                    unigrams.add(unigram);
                    if (previous != null) {
                        bigrams.add(previous + BIGRAM_SEPARATOR + unigram);
                    }
                    previous = unigram;
                }
            }
        }
        catch (FeaturePathException e) {
            throw new AnalysisEngineProcessException(e);
        }
    }

    /**
     * Writes the collected counts to the target location: unigrams, the separator line, bigrams.
     * The output stream is closed even if writing fails.
     *
     * @throws AnalysisEngineProcessException if the output cannot be opened or written
     */
    @Override
    public void collectionProcessComplete()
        throws AnalysisEngineProcessException
    {
        getLogger().info("Vocabulary size: " + unigrams.uniqueSet().size());
        getLogger().info("Writing frequencies to " + getTargetLocation());

        /* try-with-resources guarantees the stream is closed on every path */
        try (OutputStream os = CompressionUtils.getOutputStream(new File(getTargetLocation()))) {
            writeNgrams(os, unigrams);
            os.write((NGRAM_SEPARATOR_LINE + "\n").getBytes(StandardCharsets.UTF_8));
            writeNgrams(os, bigrams);
        }
        catch (UncheckedIOException e) {
            /* raised inside the stream pipeline of writeNgrams; report the underlying cause */
            throw new AnalysisEngineProcessException(e.getCause());
        }
        catch (IOException e) {
            throw new AnalysisEngineProcessException(e);
        }
    }

    /**
     * Replaces column separators and line breaks within a token by
     * {@link #COLUMN_SEP_REPLACEMENT}, so that every n-gram occupies exactly one column of one
     * output line.
     *
     * @param token a raw token
     * @return the token without column separators and line breaks
     */
    private static String sanitize(String token)
    {
        /* the column separator is a plain string, so a literal replace is sufficient */
        String withoutColumnSeparators = token.replace(COLUMN_SEPARATOR, COLUMN_SEP_REPLACEMENT);
        return NEWLINE_PATTERN.matcher(withoutColumnSeparators)
                .replaceAll(COLUMN_SEP_REPLACEMENT);
    }

    /**
     * Write counter with counts from a bag to an output stream. Entries with a count below
     * {@code minCount} are skipped; the order follows the configured sorting option, if any.
     *
     * @param os      an {@link OutputStream}
     * @param counter a {@link Bag} of string counter
     * @throws UncheckedIOException if writing to the stream fails
     */
    private void writeNgrams(OutputStream os, Bag<String> counter)
    {
        /* create token stream */
        Stream<String> stream = counter.uniqueSet().stream()
                .filter(token -> counter.getCount(token) >= minCount);

        /* sort output */
        if (sortByAlphabet) {
            stream = stream.sorted();
        }
        else if (sortByCount) {
            /* arguments swapped to obtain descending order */
            stream = stream.sorted((o1, o2) ->
                    Integer.compare(counter.getCount(o2), counter.getCount(o1)));
        }

        /* write tokens with counts */
        stream.forEach(token -> {
            String line = token + COLUMN_SEPARATOR + counter.getCount(token) + "\n";
            try {
                os.write(line.getBytes(StandardCharsets.UTF_8));
            }
            catch (IOException e) {
                throw new UncheckedIOException(e);
            }
        });
    }
}