/*
 * Copyright 2011
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.dkpro.core.api.frequency.util;

import it.unimi.dsi.fastutil.longs.LongIterator;
import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap;
import it.unimi.dsi.fastutil.objects.ObjectSet;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import com.google.common.collect.MinMaxPriorityQueue;

/**
 * A mapping from samples (keys) to long values (counts).
 *
 * <p>Suppose we want to record the number of occurrences of each word in a sentence, then this
 * class can be used as follows:</p>
 *
 * <blockquote><pre>
 * FrequencyDistribution&lt;String&gt; fd = new FrequencyDistribution&lt;String&gt;();
 * for (String word : "foo bar baz foo".split(" ")) {
 *     fd.inc(word);
 * }
 * System.out.println(fd.getCount("foo"));
 * </pre></blockquote>
 *
 * <p>The last call to {@link FrequencyDistribution#getCount} will yield 2, because the
 * word "foo" has appeared twice in the given sequence of words.</p>
 *
 * <p>This class was inspired by NLTK's <a
 * href="http://nltk.googlecode.com/svn/trunk/doc/api/nltk.probability.FreqDist-class.html">
 * FreqDist</a>.</p>
 *
 * @param <T>
 *            the type of the samples
 * @see ConditionalFrequencyDistribution
 */
public class FrequencyDistribution<T>
    implements Serializable
{
    private static final long serialVersionUID = 150;

    /** Maps each recorded sample to its count. */
    private Object2LongOpenHashMap<T> freqDist;

    /** The total number of samples (accumulated count). */
    private long n;

    /** The maximum frequency in the distribution. */
    private long maxFreq;

    /** The sample with the maximum frequency in the distribution. */
    private T maxSample;

    /**
     * Creates a new empty {@link FrequencyDistribution}.
     */
    public FrequencyDistribution()
    {
        freqDist = new Object2LongOpenHashMap<T>();
        n = 0;
    }

    /**
     * Creates a new {@link FrequencyDistribution} prefilled with samples from an {@link Iterable}.
     * The count for each sample in the iterable is cumulatively increased by 1.
     *
     * @param iterable
     *            the {@link Iterable} used to fill the {@link FrequencyDistribution}
     */
    public FrequencyDistribution(Iterable<T> iterable)
    {
        this();
        incAll(iterable);
    }

    /**
     * Indicates whether this distribution contains an entry for a given <code>sample</code>.
     *
     * @param sample
     *            the sample to look up
     * @return true if an entry for the sample exists
     */
    public boolean contains(T sample)
    {
        return this.freqDist.containsKey(sample);
    }

    /**
     * Increments the count for a given <code>sample</code> by one.
     *
     * @param sample
     *            the sample to increment the count for
     */
    public void inc(T sample)
    {
        addSample(sample, 1);
    }

    /**
     * Increments the count by one for each sample in a given {@link Iterable}.
     *
     * @param iterable
     *            the samples used to increment the counts
     */
    public void incAll(Iterable<T> iterable)
    {
        for (T o : iterable) {
            addSample(o, 1);
        }
    }

    /**
     * Returns the total number of sample outcomes that have been recorded by this frequency
     * distribution. This is equal to the accumulated count of all samples (duplicates included).
     *
     * @return the total number of sample outcomes
     */
    public long getN()
    {
        return n;
    }

    /**
     * Returns the number of distinct sample values (or bins) for which an entry has been
     * recorded (duplicates excluded). This is the size of the underlying map.
     *
     * @return the total number of bins
     */
    public long getB()
    {
        return this.freqDist.size();
    }

    /**
     * Returns the count for a given <code>sample</code>. If no such sample has been recorded yet,
     * <code>0</code> will be returned.
     *
     * @param sample
     *            the sample to get the count for
     * @return the count for a given sample
     */
    public long getCount(T sample)
    {
        if (freqDist.containsKey(sample)) {
            return freqDist.get(sample);
        }
        else {
            return 0;
        }
    }

    /**
     * Returns the {@link Set} of sample values (or bins) for which counts have been recorded.
     * The returned set is the key set of the underlying map, not a copy.
     *
     * @return the set of bins
     */
    @SuppressWarnings("unchecked")
    public Set<T> getKeys()
    {
        return this.freqDist.keySet();
    }

    /**
     * Increases the count for a given <code>sample</code> and the total count by
     * <code>number</code>.
     *
     * <p>The cached maximum is only updated when the new count of <code>sample</code> exceeds
     * it; it is not recomputed when a count is lowered by a negative <code>number</code>.</p>
     *
     * @param sample
     *            the sample to increase the count for
     * @param number
     *            the number to increase by
     */
    public void addSample(T sample, long number)
    {
        this.n = this.n + number;

        long sampleFreq = number;
        if (freqDist.containsKey(sample)) {
            sampleFreq = freqDist.get(sample) + number;
        }
        freqDist.put(sample, sampleFreq);

        // Strict comparison: on ties the sample that reached the maximum first is kept.
        if (sampleFreq > maxFreq) {
            maxFreq = sampleFreq;
            maxSample = sample;
        }
    }

    /**
     * Returns the highest frequency that is currently stored.
     *
     * @return highest frequency that is currently stored.
     */
    public long getMaxFreq()
    {
        return maxFreq;
    }

    /**
     * Returns the sample which has currently the highest frequency. If there is more than one
     * sample which share the highest frequency, returns the one that reached it first.
     *
     * @return the sample which has currently the highest frequency, or <code>null</code> if no
     *         sample with a count greater than zero has been recorded
     */
    public T getSampleWithMaxFreq()
    {
        return maxSample;
    }

    /**
     * Serializes the sample-to-count map to the given file. Only the map is written; the total
     * count and the maximum are recomputed by {@link #load(File)}.
     *
     * @param file
     *            the file to write to
     * @throws IOException
     *             if the file cannot be opened or written
     */
    public void save(File file)
        throws IOException
    {
        FileOutputStream fileOut = new FileOutputStream(file);
        try {
            ObjectOutputStream out = new ObjectOutputStream(fileOut);
            out.writeObject(freqDist);
            // Flushes the object stream before the underlying stream is released.
            out.close();
        }
        finally {
            // Releases the file handle even if creating or writing the object stream failed.
            fileOut.close();
        }
    }

    /**
     * Replaces the contents of this distribution with the map stored in the given file (as
     * written by {@link #save(File)}) and recomputes the total count and the maximum. The
     * current contents are left untouched if reading fails.
     *
     * <p>NOTE: this uses Java native deserialization and must only be used on trusted files.</p>
     *
     * @param file
     *            the file to read from
     * @throws IOException
     *             if the file cannot be opened or read
     * @throws ClassNotFoundException
     *             if a class of the serialized object cannot be found
     */
    public void load(File file)
        throws IOException, ClassNotFoundException
    {
        Object2LongOpenHashMap<T> loaded;
        FileInputStream fileIn = new FileInputStream(file);
        try {
            ObjectInputStream in = new ObjectInputStream(fileIn);
            // The element type cannot be checked at runtime because of erasure.
            @SuppressWarnings("unchecked")
            Object2LongOpenHashMap<T> map = (Object2LongOpenHashMap<T>) in.readObject();
            loaded = map;
        }
        finally {
            // Releases the file handle even if the stream header or the object is invalid.
            fileIn.close();
        }

        // Accumulate in a long (the total is a long) and start the maximum from scratch so
        // that no state of the previous contents survives.
        long total = 0;
        long bestFreq = 0;
        T bestSample = null;
        for (T key : loaded.keySet()) {
            long freq = loaded.get(key);
            total += freq;
            if (freq > bestFreq) {
                bestFreq = freq;
                bestSample = key;
            }
        }

        freqDist = loaded;
        n = total;
        maxFreq = bestFreq;
        maxSample = bestSample;
    }

    /**
     * Removes all samples and resets the total count and the maximum.
     */
    public void clear()
    {
        freqDist.clear();
        maxFreq = 0;
        maxSample = null;
        n = 0;
    }

    /**
     * Returns the n most frequent samples in the distribution. The ordering within a group of
     * samples with the same frequency is undefined.
     *
     * @param n
     *            the number of most frequent samples to return.
     * @return the n most frequent samples in the distribution; an empty list if <code>n</code>
     *         is 0.
     */
    public List<T> getMostFrequentSamples(int n)
    {
        // A bounded queue cannot be created with maximum size 0; nothing was requested anyway.
        if (n == 0) {
            return new ArrayList<T>();
        }

        // The queue orders tuples by the natural ordering of TermFreqTuple and evicts beyond n.
        MinMaxPriorityQueue<TermFreqTuple<T>> topN = MinMaxPriorityQueue.maximumSize(n).create();

        for (T key : this.getKeys()) {
            topN.add(new TermFreqTuple<T>(key, this.getCount(key)));
        }

        List<T> topNList = new ArrayList<T>(topN.size());
        while (!topN.isEmpty()) {
            topNList.add(topN.poll().getKey());
        }

        return topNList;
    }

    /**
     * Orders samples by descending value as looked up in a backing map.
     *
     * <p>{@link #compare} never returns 0: samples with equal values compare as -1 in both
     * directions. NOTE(review): this does not satisfy the symmetry requirement of
     * {@link Comparator}; presumably intentional so that a sorted map using this comparator does
     * not merge samples with equal values - confirm before changing. Both samples must be
     * present in the backing map, otherwise unboxing the looked-up value fails.</p>
     */
    class ValueComparator
        implements Comparator<T>
    {
        Map<T, Long> base;

        /**
         * @param base
         *            the map used to look up the value of each compared sample
         */
        public ValueComparator(Map<T, Long> base)
        {
            this.base = base;
        }

        @Override
        public int compare(T a, T b)
        {
            if (base.get(a) < base.get(b)) {
                return 1;
            }
            else {
                return -1;
            }
        }
    }
}