/**
* Copyright 2013-2015 Pierre Merienne
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.github.pmerienne.trident.ml.nlp;
import java.io.Serializable;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import com.google.common.base.Functions;
import com.google.common.collect.Ordering;
/**
 * A multiset of words that records how many times each word has been added.
 * <p>
 * Words are kept ordered by decreasing occurrence count; words that share the
 * same count are ordered by their natural (lexicographic) order. Iteration,
 * {@link #wordSet()} and {@link #limitWords(Integer)} all rely on that order.
 */
public class Vocabulary implements Iterable<String>, Serializable {

    private static final long serialVersionUID = 7827671824674205961L;

    /** Occurrence count of every retained word, sorted from the highest count to the lowest. */
    private TreeMap<String, Integer> wordCounts = new ValueComparableMap<String, Integer>(Ordering.natural().reverse());

    /** Sum of the occurrence counts of every retained word. */
    private Integer size = 0;

    /**
     * Creates an empty vocabulary.
     */
    public Vocabulary() {
    }

    /**
     * Creates a vocabulary and adds every given word to it.
     *
     * @param words
     *            the words to add, duplicates are counted
     * @throws NullPointerException
     *             if {@code words} is {@code null} or contains {@code null}
     */
    public Vocabulary(List<String> words) {
        this.addAll(words);
    }

    /**
     * Records one more occurrence of the given word.
     *
     * @param word
     *            the word to count
     * @throws NullPointerException
     *             if {@code word} is {@code null}
     */
    public void add(String word) {
        // Reject null before touching any state: the sorted map cannot order a
        // null key, and letting it through would leave the counts inconsistent.
        if (word == null) {
            throw new NullPointerException("word must not be null");
        }

        Integer actualCount = this.wordCounts.get(word);
        if (actualCount == null) {
            actualCount = 1;
        } else {
            actualCount++;
        }
        this.wordCounts.put(word, actualCount);
        this.size++;
    }

    /**
     * Records one occurrence of each given word, in list order.
     *
     * @param words
     *            the words to add, duplicates are counted
     * @throws NullPointerException
     *             if {@code words} is {@code null} or contains {@code null};
     *             words preceding the {@code null} element have already been
     *             added when the exception is thrown
     */
    public void addAll(List<String> words) {
        for (String word : words) {
            this.add(word);
        }
    }

    /**
     * Shrinks this vocabulary to at most {@code maxWords} distinct words by
     * repeatedly dropping the last word of the ordering, i.e. the one with
     * the lowest count (the lexicographically greatest one among ties). The
     * occurrences of dropped words no longer contribute to
     * {@link #totalCount()}.
     *
     * @param maxWords
     *            the maximum number of distinct words to keep
     * @throws NullPointerException
     *             if {@code maxWords} is {@code null}
     * @throws IllegalArgumentException
     *             if {@code maxWords} is negative
     */
    public void limitWords(Integer maxWords) {
        if (maxWords == null) {
            throw new NullPointerException("maxWords must not be null");
        }
        // A negative limit can never be satisfied; fail before removing anything
        // instead of emptying the map and then failing on lastKey().
        if (maxWords < 0) {
            throw new IllegalArgumentException("maxWords must not be negative: " + maxWords);
        }

        while (this.wordCount() > maxWords) {
            String lessFrequentWord = this.wordCounts.lastKey();
            Integer lowestCount = this.wordCounts.remove(lessFrequentWord);
            this.size -= lowestCount;
        }
    }

    /**
     * Returns how many times the given word has been added.
     *
     * @param word
     *            the word to look up
     * @return the occurrence count, {@code 0} if the word is not in this
     *         vocabulary
     */
    public Integer count(String word) {
        Integer actualCount = this.wordCounts.get(word);
        if (actualCount == null) {
            actualCount = 0;
        }
        return actualCount;
    }

    /**
     * Returns the share of all recorded occurrences that belong to the given
     * word, i.e. {@code count(word) / totalCount()}.
     *
     * @param word
     *            the word to look up
     * @return the relative frequency; {@code 0.0} when this vocabulary holds
     *         no occurrence at all
     */
    public Double frequency(String word) {
        // Guard the empty case explicitly: 0.0 / 0.0 would yield NaN.
        if (this.size.intValue() == 0) {
            return 0.0;
        }
        return this.count(word).doubleValue() / this.size.doubleValue();
    }

    /**
     * Tells whether the given word has been added and not dropped since.
     *
     * @param word
     *            the word to look up
     * @return {@code true} if this vocabulary holds the word
     */
    public Boolean contains(String word) {
        return this.wordCounts.containsKey(word);
    }

    /**
     * Returns the number of distinct words in this vocabulary.
     *
     * @return the number of distinct words
     */
    public Integer wordCount() {
        return this.wordCounts.size();
    }

    /**
     * Returns the total number of occurrences recorded for the words
     * currently in this vocabulary.
     *
     * @return the sum of all word counts
     */
    public Integer totalCount() {
        return this.size;
    }

    /**
     * Iterates over the distinct words, from the highest count to the lowest.
     * The iterator is read-only: removing through it would bypass the count
     * bookkeeping of this class.
     *
     * @return a read-only iterator over the words
     */
    @Override
    public Iterator<String> iterator() {
        return Collections.unmodifiableSet(this.wordCounts.keySet()).iterator();
    }

    /**
     * Returns a read-only view of the distinct words, ordered from the
     * highest count to the lowest. The view is unmodifiable because removing
     * words through it would bypass the count bookkeeping of this class.
     *
     * @return an unmodifiable view of the words
     */
    public Set<String> wordSet() {
        return Collections.unmodifiableSet(this.wordCounts.keySet());
    }

    @Override
    public String toString() {
        return "Vocabulary [size=" + size + ", wordCounts=" + wordCounts + "]";
    }

    /**
     * A {@link TreeMap} whose keys are sorted by their associated value
     * first, and by the keys' natural order second.
     * <p>
     * The comparator needs the value of each key while the tree is being
     * searched, so values are mirrored in a separate {@link HashMap} that the
     * comparator reads. Only {@link #get(Object)},
     * {@link #containsKey(Object)}, {@link #put(Comparable, Object)} and
     * {@link #remove(Object)} keep both structures in sync; other inherited
     * mutators are not overridden.
     *
     * <pre>
     * See <a href="http://stackoverflow.com/questions/109383/how-to-sort-a-mapkey-value-on-the-values-in-java/1283722#comment14899161_1283722">how-to-sort-a-mapkey-value-on-the-values-in-java</a>
     * </pre>
     *
     * @param <K>
     *            the key type
     * @param <V>
     *            the value type
     */
    private static class ValueComparableMap<K extends Comparable<K>, V> extends TreeMap<K, V> {

        private static final long serialVersionUID = 1476556231893371136L;

        // A map for doing lookups on the keys for comparison so we don't get
        // infinite loops
        private final Map<K, V> valueMap;

        /**
         * @param partialValueOrdering
         *            the ordering applied to the values; keys with equal
         *            values fall back to their natural order
         */
        ValueComparableMap(final Ordering<? super V> partialValueOrdering) {
            this(partialValueOrdering, new HashMap<K, V>());
        }

        private ValueComparableMap(Ordering<? super V> partialValueOrdering, HashMap<K, V> valueMap) {
            super(partialValueOrdering // Apply the value ordering
                    .onResultOf(Functions.forMap(valueMap)) // On the result of
                                                            // getting the value
                                                            // for the key from
                                                            // the map
                    .compound(Ordering.natural())); // as well as ensuring that
                                                    // the keys don't get
                                                    // clobbered
            this.valueMap = valueMap;
        }

        /**
         * Looks the value up in the hash mirror, so the comparator is never
         * asked to order a key it does not know.
         */
        @Override
        public V get(Object key) {
            return this.valueMap.get(key);
        }

        /**
         * Checks the hash mirror, so the comparator is never asked to order a
         * key it does not know.
         */
        @Override
        public boolean containsKey(Object key) {
            return this.valueMap.containsKey(key);
        }

        /**
         * Associates the value with the key and moves the key to the position
         * matching its new value.
         *
         * @return the value previously associated with the key, or
         *         {@code null} if there was none
         * @throws NullPointerException
         *             if the key is {@code null}
         */
        @Override
        public V put(K k, V v) {
            // Fail before mutating the mirror: the natural key ordering cannot
            // handle null, and a half-applied put would desynchronize the maps.
            if (k == null) {
                throw new NullPointerException("key must not be null");
            }
            if (valueMap.containsKey(k)) {
                // Remove the key from the sorted tree while the mirror still
                // holds its old value, otherwise the entry cannot be located.
                super.remove(k);
            }
            // To get "real" unsorted values for the comparator
            V previous = valueMap.put(k, v);
            // Put it in value order
            super.put(k, v);
            return previous;
        }

        /**
         * Removes the key from both the sorted tree and the hash mirror.
         *
         * @return the value that was associated with the key, or {@code null}
         *         if the key was not present
         */
        @Override
        public V remove(Object key) {
            // The comparator resolves keys through Functions.forMap, which throws
            // IllegalArgumentException for unknown keys; skip the tree for them.
            if (!this.valueMap.containsKey(key)) {
                return null;
            }
            super.remove(key);
            return this.valueMap.remove(key);
        }
    }
}