/*
 * Copyright 2010
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 **/
package de.tudarmstadt.ukp.dkpro.core.decompounding.ranking;

import java.math.BigInteger;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.apache.ivy.util.cli.CommandLine;

import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.DecompoundedWord;
import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.DecompoundingTree;
import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.Fragment;
import de.tudarmstadt.ukp.dkpro.core.decompounding.trie.ValueNode;
import de.tudarmstadt.ukp.dkpro.core.decompounding.web1t.Finder;
import de.tudarmstadt.ukp.dkpro.core.decompounding.web1t.NGramModel;

/**
 * Contains base methods for the ranking algorithms: frequency lookups through a
 * {@link Finder}, command line option helpers and filtering/sorting of ranked splits.
 */
public abstract class AbstractRanker
    implements Ranker
{
    /** Name of the command line option read by {@link #getIndexPathOption(CommandLine)}. */
    public static final String INDEX_OPTION = "luceneIndex";

    /** Name of the command line option read by {@link #getLimitOption(CommandLine)}. */
    public static final String LIMIT_OPTION = "limit";

    /** Source of all frequency lookups; stays {@code null} until a finder is supplied. */
    private Finder finder;

    /**
     * Empty constructor.
     *
     * Use {@link #setFinder(Finder)} before using this class.
     */
    public AbstractRanker()
    {
        // Nothing to initialize; the finder has to be injected afterwards.
    }

    /**
     * Creates a ranker that uses the given finder.
     *
     * @param aFinder
     *            the finder used for frequency lookups.
     */
    public AbstractRanker(Finder aFinder)
    {
        finder = aFinder;
    }

    /**
     * Returns the finder currently used by this ranker.
     *
     * @return the finder, or {@code null} if none has been set.
     */
    public Finder getFinder()
    {
        return finder;
    }

    /**
     * Replaces the finder used by this ranker.
     *
     * @param aFinder
     *            the finder used for frequency lookups.
     */
    @Override
    public void setFinder(Finder aFinder)
    {
        finder = aFinder;
    }

    /**
     * Gets the frequency of a split element.
     *
     * @param aWord
     *            a fragment.
     * @return the frequency reported by the finder for the word of the fragment.
     */
    protected BigInteger freq(Fragment aWord)
    {
        return finder.freq(aWord.getWord());
    }

    /**
     * Returns the frequency of n-grams that contain both split elements.
     *
     * @param aWord1
     *            a fragment.
     * @param aWord2
     *            another fragment.
     * @return the n-gram frequency.
     */
    protected BigInteger freq(Fragment aWord1, Fragment aWord2)
    {
        return freq(new String[] { aWord1.getWord(), aWord2.getWord() });
    }

    /**
     * Returns the frequency for an array of words by summing the frequencies of all n-grams
     * the finder returns for them.
     *
     * @param aWords
     *            the words.
     * @return the summed frequency; zero if the finder returns no n-grams.
     */
    protected BigInteger freq(String[] aWords)
    {
        BigInteger total = BigInteger.ZERO;

        for (NGramModel gram : finder.find(aWords)) {
            total = total.add(BigInteger.valueOf(gram.getFreq()));
        }

        return total;
    }

    /**
     * Reads the {@link #LIMIT_OPTION} from the command line.
     *
     * @param aCmd
     *            the parsed command line.
     * @return the limit, or {@link Integer#MAX_VALUE} if the option is not present.
     * @throws NumberFormatException
     *             if the option value is not a parsable integer.
     */
    public static int getLimitOption(CommandLine aCmd)
    {
        if (!aCmd.hasOption(LIMIT_OPTION)) {
            return Integer.MAX_VALUE;
        }

        // parseInt yields the primitive directly instead of boxing and unboxing an Integer.
        return Integer.parseInt(aCmd.getOptionValue(LIMIT_OPTION));
    }

    /**
     * Reads the {@link #INDEX_OPTION} from the command line.
     *
     * @param aCmd
     *            the parsed command line.
     * @return the value of the option as returned by the command line.
     */
    public static String getIndexPathOption(CommandLine aCmd)
    {
        return aCmd.getOptionValue(INDEX_OPTION);
    }

    /**
     * Keeps the splits whose weight is a finite number greater than zero and sorts them with
     * {@link Collections#sort(List)}. If no split passes the filter, the result contains only
     * the first element of the input.
     *
     * Expects that the splits list contains at least one element and that this is the unsplit
     * word.
     *
     * @param aSplits
     *            the splits.
     * @return the filtered splits; a new list, the input list is not modified.
     * @throws IndexOutOfBoundsException
     *             if no split passes the filter and {@code aSplits} is empty.
     */
    public static List<DecompoundedWord> filterAndSort(List<DecompoundedWord> aSplits)
    {
        List<DecompoundedWord> filtered = new ArrayList<DecompoundedWord>();

        for (DecompoundedWord split : aSplits) {
            double weight = split.getWeight();
            // NaN would also fail the "> 0.0" comparison; the explicit test states the intent.
            if (!Double.isNaN(weight) && !Double.isInfinite(weight) && (weight > 0.0)) {
                filtered.add(split);
            }
        }

        Collections.sort(filtered);

        if (filtered.isEmpty()) {
            // Nothing usable was ranked: fall back to the first input element.
            filtered.add(aSplits.get(0));
        }

        return filtered;
    }

    /**
     * Searches the tree for the highest ranked split, starting at the root of the tree with
     * {@code null} as the path.
     *
     * @param aTree
     *            the decompounding tree.
     * @return the result of {@link #highestRank(ValueNode, List)} for the root node.
     */
    @Override
    public DecompoundedWord highestRank(DecompoundingTree aTree)
    {
        return highestRank(aTree.getRoot(), null);
    }

    /**
     * Searches for the highest ranked split below the given node. The traversal and ranking
     * strategy is defined by the concrete ranker.
     *
     * @param aParent
     *            the node at which the search starts.
     * @param aPath
     *            the path argument handed to the implementation; {@code null} when called via
     *            {@link #highestRank(DecompoundingTree)}.
     * @return the highest ranked split as determined by the implementation.
     */
    public abstract DecompoundedWord highestRank(ValueNode<DecompoundedWord> aParent,
            List<DecompoundedWord> aPath);
}