package edu.harvard.wcfia.yoshikoder.reporting; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.Enumeration; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.logging.Logger; import java.util.regex.Matcher; import java.util.regex.Pattern; import edu.harvard.wcfia.yoshikoder.dictionary.CategoryNode; import edu.harvard.wcfia.yoshikoder.dictionary.Node; import edu.harvard.wcfia.yoshikoder.dictionary.PatternNode; import edu.harvard.wcfia.yoshikoder.dictionary.SimpleDictionary; import edu.harvard.wcfia.yoshikoder.dictionary.YKDictionary; import edu.harvard.wcfia.yoshikoder.document.tokenizer.Token; import edu.harvard.wcfia.yoshikoder.document.tokenizer.TokenList; public class EntryFrequencyMap { private static Logger log = Logger.getLogger("edu.harvard.wcfia.yoshikoder.reporting.EntryFrequencyMap"); protected Map nodeToCount; protected int tokenTotal; protected CategoryNode currentTopNode; protected Comparator comparator = new Comparator(){ // sort alphabetically on the path public int compare(Object o1, Object o2) { Node n1 = (Node)o1; Node n2 = (Node)o2; String path1 = getEntryPath(n1); String path2 = getEntryPath(n2); return path1.compareTo(path2); } }; public EntryFrequencyMap(CategoryNode node, TokenList tl){ nodeToCount = new HashMap(); tokenTotal = tl.size(); currentTopNode = (CategoryNode)node.getParent(); collectPatterns(node, tl); } public EntryFrequencyMap(YKDictionary dict, TokenList tl){ nodeToCount = new HashMap(); tokenTotal = tl.size(); CategoryNode root = dict.getDictionaryRoot(); collectPatterns(root, tl); } public int getTokenTotal(){ return tokenTotal; } protected void collectPatterns(Node node, TokenList tl){ //log.info("in collectPatterns: node = " + node.getName()); WordFrequencyMap wmap = tl.getWordFrequencyMap(); List<String> words = wmap.getVocabularyList(); if (node instanceof PatternNode){ Pattern pattern = ((PatternNode)node).getPattern(); Matcher matcher = pattern.matcher(""); int count = 0; for (Iterator iter = words.iterator(); iter.hasNext();) { String w = (String) iter.next(); matcher.reset(w); if (matcher.matches()) count += wmap.getWordCount(w); } nodeToCount.put(node, new Integer(count)); //log.info("putting " + node.getName() + " -> " + count); // walk upwards, adding count to every parent Node parent = node; while ((parent = (Node)parent.getParent()) != currentTopNode){ Integer c = (Integer)nodeToCount.get(parent); //log.info("parent " + parent.getName() + " has count " + c); Integer newCount = new Integer(c.intValue() + count); nodeToCount.put(parent, newCount); //log.info("now putting " + node.getName() + " -> " + count); } } else { nodeToCount.put(node, new Integer(0)); //log.info("in category node fork: putting " + node.getName() + " -> " + "0"); } // recurse Enumeration en = node.children(); while (en.hasMoreElements()){ Node n = (Node)en.nextElement(); collectPatterns(n, tl); } } public List getSortedEntries(){ List nl = new ArrayList(nodeToCount.keySet()); Collections.sort(nl, comparator); return nl; } public List getSortedCategoryEntries(){ List nl = getSortedEntries(); List cats = new ArrayList(); for (Iterator iter = nl.iterator(); iter.hasNext();) { Node node = (Node) iter.next(); if (node instanceof CategoryNode) cats.add(node); } return cats; } public Integer getEntryCount(Node dictionaryNode){ return (Integer)nodeToCount.get(dictionaryNode); } public Double getEntryProportion(Node dictionaryNode){ Integer count = getEntryCount(dictionaryNode); if (count != null) if (tokenTotal > 0) return new Double(count.doubleValue() / tokenTotal); else return new Double(0); else return null; } public String getEntryPath(Node n){ Node parent = n; StringBuffer sb = new StringBuffer(); sb.append(n.getName()); while ((parent = (Node)parent.getParent()) != null){ sb.insert(0, parent.getName() + ">"); } return sb.toString(); } /** * Returns null if no score is set and score*count otherwise. * * @param dictionaryNode * @return summed score */ public Double getSummedScore(Node dictionaryNode){ Double score = dictionaryNode.getScore(); if (score != null){ Integer ii = (Integer)nodeToCount.get(dictionaryNode); if (ii == null) return new Double(0); else return new Double(ii.intValue() * score.doubleValue()); } return null; } public String toString(){ return nodeToCount.toString(); } }