/*
* Copyright 2014
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.jazzy;
import static de.tudarmstadt.ukp.dkpro.core.jazzy.util.ContextualizerUtils.getCandidatePosition;
import static de.tudarmstadt.ukp.dkpro.core.jazzy.util.ContextualizerUtils.getChangedWords;
import static de.tudarmstadt.ukp.dkpro.core.jazzy.util.ContextualizerUtils.getTrigram;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang.StringUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ExternalResource;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.resource.ResourceInitializationException;
import de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SpellingAnomaly;
import de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SuggestedAction;
import de.tudarmstadt.ukp.dkpro.core.api.frequency.provider.FrequencyCountProvider;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringIterable;
/**
 * This component assumes that some spell checker has already been applied upstream (e.g. Jazzy).
 * It then uses n-gram frequencies from a frequency provider in order to rank the provided
 * corrections.
 * <p>
 * For every {@link SpellingAnomaly} covered by a {@link Sentence}, each suggested replacement is
 * substituted into the sentence, the resulting word sequence is scored with
 * {@link #getSentenceProbability(List)}, and the score is stored as the certainty of the
 * suggestion. The original (uncorrected) text is appended as one additional suggestion so that it
 * can compete with the corrections.
 */
public class CorrectionsContextualizer
    extends JCasAnnotator_ImplBase
{
    /** Begin-of-sentence marker used to pad the first two trigrams of a sentence. */
    private static final String BOS = "<S>";

    /** Key under which the frequency provider is bound as an external resource. */
    public final static String FREQUENCY_PROVIDER_RESOURCE = "FrequencyProvider";
    @ExternalResource(key = FREQUENCY_PROVIDER_RESOURCE)
    private FrequencyCountProvider provider;

    /** Cache of n-gram string to frequency, filled lazily by {@link #getNGramCount(String)}. */
    protected Map<String,Long> countCache;

    /**
     * Initializes the component and creates an empty n-gram count cache.
     *
     * @param context
     *            the UIMA context passed on to the super class.
     * @throws ResourceInitializationException
     *             if the super class initialization fails.
     */
    @Override
    public void initialize(UimaContext context)
        throws ResourceInitializationException
    {
        super.initialize(context);

        countCache = new HashMap<String,Long>();
    }

    /**
     * Re-scores the suggestions of every spelling anomaly that is covered by a sentence.
     * <p>
     * The suggestion array of each anomaly is replaced by a new array that holds the re-scored
     * original suggestions followed by one extra suggestion for the unchanged text. An anomaly
     * without a suggestion array is treated as having zero suggestions and therefore ends up with
     * the unchanged text as its only suggestion.
     *
     * @param jcas
     *            the CAS to process.
     * @throws AnalysisEngineProcessException
     *             if the frequency provider fails.
     */
    @Override
    public void process(JCas jcas)
        throws AnalysisEngineProcessException
    {
        for (Sentence sentence : JCasUtil.select(jcas, Sentence.class)) {
            List<Token> tokens = JCasUtil.selectCovered(jcas, Token.class, sentence);
            List<String> tokenStrings = JCasUtil.toText(tokens);

            for (SpellingAnomaly anomaly : JCasUtil.selectCovered(jcas, SpellingAnomaly.class, sentence)) {
                FSArray suggestedActions = anomaly.getSuggestions();

                // The suggestions feature may be unset; treat that like an empty array instead
                // of failing with a NullPointerException.
                int n = (suggestedActions == null) ? 0 : suggestedActions.size();

                // one extra slot for the original word
                FSArray newActions = new FSArray(jcas, n + 1);
                for (int i = 0; i < n; i++) {
                    SuggestedAction action = (SuggestedAction) suggestedActions.get(i);

                    List<String> changedWords = getChangedWords(action.getReplacement(),
                            tokenStrings, getCandidatePosition(anomaly, tokens));

                    double probability = getSentenceProbability(changedWords);

                    // NOTE(review): the score is narrowed to float here; for long sentences the
                    // value may underflow to 0 - confirm whether consumers rely on the scale.
                    action.setCertainty((float) probability);
                    newActions.set(i, action);
                }

                // add the original word as a possibility
                // might turn out that it fits in well according to ngram model
                SuggestedAction newAction = new SuggestedAction(jcas);
                newAction.setReplacement(anomaly.getCoveredText());
                newAction.setCertainty((float) getSentenceProbability(tokenStrings));
                newActions.set(n, newAction);

                anomaly.setSuggestions(newActions);
            }
        }
    }

    /**
     * Scores a word sequence using trigram, bigram and unigram counts from the frequency provider.
     * <p>
     * The sequence is decomposed into trigrams (the first two padded with the begin-of-sentence
     * marker). For each trigram the average of three log ratios is added to a running sum, and the
     * exponential of that sum is returned. All counts below 1 are raised to 1 before the ratios
     * are taken, so no ratio is ever zero or undefined.
     *
     * @param words
     *            the words of the sentence, in order.
     * @return the score of the sequence, or {@code 0.0} if {@code words} is empty.
     * @throws AnalysisEngineProcessException
     *             if the frequency provider fails.
     */
    protected double getSentenceProbability(List<String> words)
        throws AnalysisEngineProcessException
    {
        double sentenceProbability = 0.0;

        if (words.size() < 1) {
            return 0.0;
        }

        long nrOfUnigrams;
        try {
            nrOfUnigrams = provider.getNrOfTokens();
        }
        catch (Exception e) {
            throw new AnalysisEngineProcessException(e);
        }

        // A provider that reports no tokens would make the unigram ratio infinite (or NaN for a
        // negative value); clamp it the same way the n-gram counts below are clamped.
        if (nrOfUnigrams < 1) {
            nrOfUnigrams = 1;
        }

        List<String> trigrams = new ArrayList<String>();

        // in the google n-grams this is not represented (only single BOS markers)
        // but I leave it in place in case we add another n-gram provider
        trigrams.add(getTrigram(BOS, BOS, words.get(0)));

        if (words.size() > 1) {
            trigrams.add(getTrigram(BOS, words.get(0), words.get(1)));
        }

        for (String trigram : new NGramStringIterable(words, 3, 3)) {
            trigrams.add(trigram);
        }

        // FIXME - implement backoff or linear interpolation
        // NOTE(review): the three log terms below add up to log(trigramFreq / nrOfUnigrams), so
        // the bigram and unigram counts cancel out of the score (up to floating-point rounding).
        for (String trigram : trigrams) {
            long trigramFreq = getNGramCount(trigram);

            // the bigram and unigram are the first two / first one space-separated parts
            String[] parts = StringUtils.split(trigram, " ");

            String bigram = StringUtils.join(Arrays.copyOfRange(parts, 0, 2), " ");
            long bigramFreq = getNGramCount(bigram);

            String unigram = StringUtils.join(Arrays.copyOfRange(parts, 0, 1), " ");
            long unigramFreq = getNGramCount(unigram);

            // unseen n-grams count as 1 so that the logarithms stay finite
            if (trigramFreq < 1) {
                trigramFreq = 1;
            }
            if (bigramFreq < 1) {
                bigramFreq = 1;
            }
            if (unigramFreq < 1) {
                unigramFreq = 1;
            }

            double trigramProb = Math.log((double) trigramFreq / bigramFreq);
            double bigramProb = Math.log((double) bigramFreq / unigramFreq);
            double unigramProb = Math.log((double) unigramFreq / nrOfUnigrams);

            double interpolated = (trigramProb + bigramProb + unigramProb) / 3.0;

            sentenceProbability += interpolated;
        }

        return Math.exp(sentenceProbability);
    }

    /**
     * Returns the frequency of an n-gram, asking the provider only on the first request for a
     * given string and serving later requests from {@link #countCache}.
     *
     * @param ngram
     *            the n-gram string to look up.
     * @return the frequency reported by the provider.
     * @throws AnalysisEngineProcessException
     *             if the frequency provider fails.
     */
    protected long getNGramCount(String ngram)
        throws AnalysisEngineProcessException
    {
        // single map lookup; a missing entry is the only case in which get() yields null because
        // only non-null counts are ever stored by this method
        Long count = countCache.get(ngram);
        if (count == null) {
            try {
                count = provider.getFrequency(ngram);
            }
            catch (Exception e) {
                throw new AnalysisEngineProcessException(e);
            }
            countCache.put(ngram, count);
        }

        return count;
    }
}