package edu.hawaii.jmotif.text; import java.util.Collection; import java.util.HashMap; import java.util.Map.Entry; import java.util.concurrent.atomic.AtomicInteger; /** * Implements a word bag. Word bag is the container for pairs word-frequency. It is not text, it is * usually a processed text. * * @author psenin * */ public class WordBag { private static final String CR = "\n"; private HashMap<String, AtomicInteger> words; private String label; private int cachedMax; private boolean changed = true; private double cachedAverage; /** * Constructor. * * @param bagLabel The name for the collection. */ public WordBag(String bagLabel) { super(); this.label = bagLabel.substring(0); this.words = new HashMap<String, AtomicInteger>(); } /** * Constructor * * @param bagName The name for the collection. * @param words The words data for the collection. */ public WordBag(String bagName, HashMap<String, Integer> words) { this.label = bagName; this.words = new HashMap<String, AtomicInteger>(); for (Entry<String, Integer> e : words.entrySet()) { this.words.put(e.getKey(), new AtomicInteger(e.getValue())); } } /** * Set the new label on this bag. * * @param newBagLabel The new label. */ public synchronized void setLabel(String newBagLabel) { this.label = newBagLabel; } /** * Get the wordbag id or name. * * @return the label string. */ public synchronized String getLabel() { return this.label; } /** * Add the word into the bag. * * @param word The word to add. */ public synchronized void addWord(String word) { this.changed = true; if (this.words.containsKey(word)) { this.words.get(word).incrementAndGet(); } else { this.words.put(word, new AtomicInteger(1)); } } /** * Add the word into the dictionary. * * @param word The word. * @param frequency Word's frequency. */ public synchronized void addWord(String word, Integer frequency) { this.changed = true; if (this.words.containsKey(word)) { this.words.get(word).set(this.words.get(word).intValue() + frequency); } else { this.words.put(word, new AtomicInteger(frequency)); } } /** * Get the word occurrence frequency, if word is not in returns 0. * * @param word The word to look for. * @return The word frequency. */ public synchronized Integer getWordFrequency(String word) { if (this.words.containsKey(word)) { return this.words.get(word).intValue(); } return 0; } /** * Quick check if the word is in the text. * * @param word The word to check for. * @return True if the word seen in text. */ public synchronized boolean contains(String word) { return this.words.keySet().contains(word); } /** * Get the words set. * * @return The words set. */ public synchronized Collection<String> getWordSet() { return this.words.keySet(); } /** * Get the words collection along with frequencies. * * @return The map of words as keys with frequencies as values. */ public synchronized HashMap<String, Double> getWordsAsDoubles() { HashMap<String, Double> res = new HashMap<String, Double>(); for (Entry<String, AtomicInteger> e : this.words.entrySet()) { res.put(e.getKey(), e.getValue().doubleValue()); } return res; } /** * Get the words collection along with frequencies. * * @return The map of words as keys with frequencies as values. */ public synchronized HashMap<String, Integer> getWords() { HashMap<String, Integer> res = new HashMap<String, Integer>(this.words.size()); for (Entry<String, AtomicInteger> e : this.words.entrySet()) { res.put(e.getKey(), e.getValue().intValue()); } return res; } /** * Get the words collection along with frequencies. * * @return The map of words as keys with frequencies as values. */ public synchronized HashMap<String, AtomicInteger> getInternalWords() { return this.words; } /** * Integral of all frequency values. * * @return sum of all frequency values. */ public synchronized int getTotalWordCount() { int res = 0; for (AtomicInteger count : this.words.values()) { res = res + count.intValue(); } return res; } /** * Implements merge operation for this bag with some other bag. Resulting words set is the union * of words from two bags, and resulting frequencies are the sum of two frequencies. * * @param otherBag The bag to merge with. */ public synchronized void mergeWith(WordBag otherBag) { this.changed = true; for (Entry<String, Integer> entry : otherBag.getWords().entrySet()) if (this.words.containsKey(entry.getKey())) { this.words.get(entry.getKey()).set( words.get(entry.getKey()).intValue() + otherBag.getWordFrequency(entry.getKey()).intValue()); } else { this.words.put(entry.getKey(), new AtomicInteger(entry.getValue())); } } @Override public String toString() { StringBuffer sb = new StringBuffer(); sb.append(this.label).append(CR); for (Entry<String, AtomicInteger> entry : this.words.entrySet()) { sb.append(entry.getKey()).append("\t").append(entry.getValue().intValue()).append(CR); } return sb.toString(); } public String toColumn() { StringBuffer sb = new StringBuffer(); for (Entry<String, AtomicInteger> entry : this.words.entrySet()) { for (int i = 0; i < entry.getValue().intValue(); i++) { sb.append(entry.getKey()).append(CR); } } return sb.toString(); } /** * Get the maximal observed frequency. Useful for normalized tf. * * @return */ public synchronized int getMaxFrequency() { if (changed) { this.cachedMax = 0; for (AtomicInteger num : this.words.values()) { if (this.cachedMax < num.intValue()) { this.cachedMax = num.intValue(); } } this.changed = false; return this.cachedMax; } return this.cachedMax; } public double getAverageFrequency() { if (changed) { int res = 0; for (AtomicInteger num : this.words.values()) { res = res + num.intValue(); } this.cachedAverage = (double) res / (double) this.words.size(); this.changed = false; return this.cachedAverage; } return this.cachedAverage; } }