/* * Copyright 2010 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. **/ package de.tudarmstadt.ukp.dkpro.core.decompounding.ranking; import java.math.BigInteger; import java.util.List; import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.DecompoundedWord; import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.Fragment; import de.tudarmstadt.ukp.dkpro.core.decompounding.trie.ValueNode; import de.tudarmstadt.ukp.dkpro.core.decompounding.web1t.Finder; /** * Mutual informationen based ranking algorithm. See doc folder for more * information * */ public class MutualInformationRanker extends AbstractRanker implements RankerList { /** * Empty constructor * * Use {@link #setFinder(Finder)} before using this class */ public MutualInformationRanker() { } public MutualInformationRanker(Finder aFinder) { super(aFinder); } @Override public DecompoundedWord highestRank(List<DecompoundedWord> aSplits) { return rank(aSplits).get(0); } @Override public List<DecompoundedWord> rank(List<DecompoundedWord> aSplits) { for (DecompoundedWord split : aSplits) { double weight = calcRank(split); if (Double.isInfinite(split.getWeight()) || Double.isNaN(split.getWeight())) { weight = 0.0; } split.setWeight(weight); } return filterAndSort(aSplits); } /** * Calculates the weight for a split */ private float calcRank(DecompoundedWord aSplit) { double total = 0; double count = 0; BigInteger unigramCount = getFinder().getUnigramCount(); if (aSplit.getSplits().size() == 1) { // Entropy for single words Fragment w = aSplit.getSplits().get(0); double p = freq(w).doubleValue() / unigramCount.doubleValue(); return (float) ((-1) * p * Math.log(p)); } // Mutual Information for splits. for (int i = 1; i < aSplit.getSplits().size(); i++) { count++; Fragment w1 = aSplit.getSplits().get(i - 1); Fragment w2 = aSplit.getSplits().get(i); // Look up unigram frequencies first - this is fast and allows us to bail out early BigInteger w1f = freq(w1); if (w1f.equals(BigInteger.ZERO)) { continue; } BigInteger w2f = freq(w2); if (w2f.equals(BigInteger.ZERO)) { continue; } // This is a slow lookup that we only do if the unigram frequencies are greate than 0 double a = freq(w1, w2).multiply(unigramCount).doubleValue(); if (a == 0d) { continue; } // Finally calculate double b = w1f.multiply(w2f).doubleValue(); total += Math.log(a / b); } return (float) (total / count); } /** * Searches a a path throw the tree */ @Override public DecompoundedWord highestRank(ValueNode<DecompoundedWord> aParent, List<DecompoundedWord> aPath) { if (aPath != null) { aPath.add(aParent.getValue()); } List<DecompoundedWord> children = aParent.getChildrenValues(); if (children.size() == 0) { return aParent.getValue(); } children.add(aParent.getValue()); List<DecompoundedWord> result = rank(children); DecompoundedWord best = result.get(0); if (best.equals(aParent.getValue())) { // None of the childs get a better score than the parent return aParent.getValue(); } else { // Find the child node that ranked best and recurse for (ValueNode<DecompoundedWord> split : aParent.getChildren()) { if (best.equals(split.getValue())) { return highestRank(split, aPath); } } } return null; } }