/*
* Copyright 2013
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.ldweb1t;
import static de.tudarmstadt.ukp.dkpro.core.frequency.Web1TProviderBase.BOS;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang.StringUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ExternalResource;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import de.tudarmstadt.ukp.dkpro.core.api.frequency.provider.FrequencyCountProvider;
import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyUtils;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringIterable;
/**
 * Language detector based on n-gram frequency counts, e.g. as provided by Web1T.
 *
 * <p>
 * For every configured frequency provider (one per candidate language) the token n-grams of the
 * document are scored with a weighted log-probability. The language of the best scoring provider
 * is set as the document language; if no provider yields a usable score the language is set to
 * {@code x-unspecified}. Documents without tokens are left untouched.
 * </p>
 */
public class LanguageDetectorWeb1T
    extends JCasAnnotator_ImplBase
{
    /**
     * An array of external resources of frequency providers (one for each language that should
     * be detected).
     */
    public static final String PARAM_FREQUENCY_PROVIDER_RESOURCES = "frequencyProviders";
    @ExternalResource(key = PARAM_FREQUENCY_PROVIDER_RESOURCES, mandatory = true)
    private FrequencyCountProvider[] frequencyProviders;

    /**
     * The minimum n-gram size that should be considered. Default is 1.
     */
    public static final String PARAM_MIN_NGRAM_SIZE = "minNGramSize";
    @ConfigurationParameter(name = PARAM_MIN_NGRAM_SIZE, mandatory = true, defaultValue = "1")
    private int minNGramSize;

    /**
     * The maximum n-gram size that should be considered. Default is 3.
     */
    public static final String PARAM_MAX_NGRAM_SIZE = "maxNGramSize";
    @ConfigurationParameter(name = PARAM_MAX_NGRAM_SIZE, mandatory = true, defaultValue = "3")
    private int maxNGramSize;

    /** Size of the extra n-gram that is built from the sentence-start marker and two tokens. */
    private static final int BOS_NGRAM_SIZE = 3;

    /** Language reported when no provider produces a usable score. */
    private static final String UNKNOWN_LANGUAGE = "x-unspecified";

    /** Frequency providers keyed by the language each of them reports. */
    private Map<String, FrequencyCountProvider> providerMap;

    /**
     * Validates the configured n-gram range and indexes the frequency providers by language.
     *
     * @param context
     *            the UIMA context handed in by the framework.
     * @throws ResourceInitializationException
     *             if the n-gram range is invalid or a provider cannot report its language.
     */
    @Override
    public void initialize(UimaContext context)
        throws ResourceInitializationException
    {
        super.initialize(context);

        if (minNGramSize < 1 || maxNGramSize < minNGramSize) {
            throw new ResourceInitializationException(new IllegalArgumentException(
                    "Invalid n-gram range [" + minNGramSize + ", " + maxNGramSize
                            + "]: the minimum must be >= 1 and must not exceed the maximum."));
        }

        providerMap = new HashMap<String, FrequencyCountProvider>();
        for (FrequencyCountProvider provider : frequencyProviders) {
            try {
                providerMap.put(provider.getLanguage(), provider);
            }
            catch (Exception e) {
                throw new ResourceInitializationException(e);
            }
        }
    }

    /**
     * Scores the token n-grams of the document against every provider and sets the document
     * language to the best scoring one.
     *
     * @param jcas
     *            the CAS to process; its {@link Token} annotations are the input.
     * @throws AnalysisEngineProcessException
     *             if a frequency provider fails while being queried.
     */
    @Override
    public void process(JCas jcas)
        throws AnalysisEngineProcessException
    {
        List<String> words = JCasUtil.toText(JCasUtil.select(jcas, Token.class));
        if (words.isEmpty()) {
            return;
        }

        List<String> ngrams = new ArrayList<String>();

        // The sentence-start trigram is only meaningful if trigrams are part of the configured
        // range; otherwise it would bypass the configured maximum/minimum n-gram size.
        if (words.size() > 1 && minNGramSize <= BOS_NGRAM_SIZE
                && maxNGramSize >= BOS_NGRAM_SIZE) {
            ngrams.add(getNgram(BOS, words.get(0), words.get(1)));
        }

        // Use the configured range instead of a hard-coded 1..3.
        for (String ngram : new NGramStringIterable(words, minNGramSize, maxNGramSize)) {
            ngrams.add(ngram);
        }

        try {
            String maxLanguage = UNKNOWN_LANGUAGE;
            double maxLogProb = Double.NEGATIVE_INFINITY;
            for (Map.Entry<String, Double> entry : getLanguageProbabilities(ngrams).entrySet()) {
                double logProb = entry.getValue();
                if (logProb > maxLogProb) {
                    maxLogProb = logProb;
                    maxLanguage = entry.getKey();
                }
            }
            jcas.setDocumentLanguage(maxLanguage);
        }
        catch (Exception e) {
            throw new AnalysisEngineProcessException(e);
        }
    }

    /**
     * Computes, for every configured language, the summed log-score of the given n-grams.
     *
     * <p>
     * An n-gram with frequency {@code f > 0} contributes {@code log(w * f / N)}, an unseen n-gram
     * contributes {@code log(1 / N)}, where {@code N} is the number of n-grams of that size the
     * provider reports and {@code w} is the size-dependent weight (see
     * {@link #weightFor(int)}).
     * </p>
     *
     * @param ngrams
     *            the n-grams of the document, tokens separated by whitespace.
     * @return a map from language to log-score.
     * @throws Exception
     *             if a provider fails while being queried.
     */
    private Map<String, Double> getLanguageProbabilities(List<String> ngrams)
        throws Exception
    {
        Map<String, Double> langProbs = new HashMap<String, Double>();

        for (Map.Entry<String, FrequencyCountProvider> entry : providerMap.entrySet()) {
            FrequencyCountProvider provider = entry.getValue();

            // Totals per n-gram size, fetched on first use so that each size is queried at most
            // once per provider and sizes beyond 3 are normalized correctly as well.
            Map<Integer, Long> totalsBySize = new HashMap<Integer, Long>();

            double textLogProbability = 0.0;
            for (String ngram : ngrams) {
                long frequency = provider.getFrequency(ngram);
                int ngramSize = FrequencyUtils.getPhraseLength(ngram);
                long normalization = normalizationFor(provider, ngramSize, totalsBySize);

                if (frequency > 0) {
                    textLogProbability += Math
                            .log(weightFor(ngramSize) * ((double) frequency) / normalization);
                }
                else {
                    textLogProbability += Math.log(1.0 / normalization);
                }
            }

            langProbs.put(entry.getKey(), textLogProbability);
        }

        return langProbs;
    }

    /**
     * Returns the normalization constant for n-grams of the given size, caching provider lookups.
     * Falls back to 1 when the size is not positive or the provider reports no positive total,
     * which keeps the logarithm finite (no division by zero, no logarithm of a negative number).
     */
    private long normalizationFor(FrequencyCountProvider provider, int ngramSize,
            Map<Integer, Long> totalsBySize)
        throws Exception
    {
        if (ngramSize < 1) {
            return 1;
        }

        Long total = totalsBySize.get(ngramSize);
        if (total == null) {
            total = provider.getNrOfNgrams(ngramSize);
            totalsBySize.put(ngramSize, total);
        }
        return Math.max(1L, total);
    }

    /**
     * Returns the weight of an n-gram of the given size: 1 for unigrams, doubling with every
     * additional token (1, 2, 4, ...), so that longer matches count more.
     */
    private static double weightFor(int ngramSize)
    {
        if (ngramSize < 1) {
            return 1.0;
        }
        return Math.pow(2.0, ngramSize - 1);
    }

    /**
     * Joins the given tokens with single spaces into one n-gram string.
     */
    private String getNgram(String... strings)
    {
        return StringUtils.join(strings, " ");
    }
}