package edu.harvard.wcfia.yoshikoder.cl;

import java.io.File;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

import edu.harvard.wcfia.yoshikoder.YKProject;
import edu.harvard.wcfia.yoshikoder.dictionary.CategoryNode;
import edu.harvard.wcfia.yoshikoder.dictionary.Node;
import edu.harvard.wcfia.yoshikoder.dictionary.YKDictionary;
import edu.harvard.wcfia.yoshikoder.document.tokenizer.BITokenizerImpl;
import edu.harvard.wcfia.yoshikoder.document.tokenizer.Token;
import edu.harvard.wcfia.yoshikoder.document.tokenizer.TokenList;
import edu.harvard.wcfia.yoshikoder.document.tokenizer.Tokenizer;
import edu.harvard.wcfia.yoshikoder.util.FileUtil;
import edu.harvard.wcfia.yoshikoder.util.ImportUtil;

/**
 * Annotates the tokens of a document with the dictionary categories that
 * match them.
 * <p>
 * The document is tokenized once in the constructor; every
 * {@link CategoryNode} reachable from the dictionary root is then asked for
 * its matching tokens, and each matching token is associated with the set of
 * categories that matched it. {@link #toString()} renders the result as one
 * token per line.
 */
public class Annotator {

    /**
     * Maps a {@link Token} to the {@link Set} of {@link CategoryNode}s that
     * matched it. Tokens that matched no category have no entry.
     * NOTE(review): lookups rely on Token's equals/hashCode and on
     * getMatchingTokens returning tokens equal to those in {@link #tokens}
     * - confirm against the Token and YKDictionary implementations.
     */
    protected Map tokenToCategorySet;

    /** Dictionary whose categories are matched against the document. */
    protected YKDictionary dictionary;

    /** Tokens of the document, as produced by {@link BITokenizerImpl}. */
    protected TokenList tokens;

    /**
     * Tokenizes {@code document} and records the matching categories of
     * {@code dict} for each token.
     * <p>
     * Note that this constructor calls the overridable
     * {@link #fillFrom(Node)}; subclasses overriding it must not depend on
     * their own fields being initialised at that point.
     *
     * @param dict     dictionary providing the category tree
     * @param document text file to annotate
     * @throws Exception if the document cannot be read or tokenized
     */
    public Annotator(YKDictionary dict, File document) throws Exception {
        dictionary = dict;
        tokenToCategorySet = new HashMap();

        Tokenizer tokenizer = new BITokenizerImpl();
        String txt = FileUtil.slurp(document);
        tokens = tokenizer.getTokens(txt);

        // Walk every top-level node of the dictionary; fillFrom recurses
        // into the children of category nodes.
        Enumeration en = dict.getDictionaryRoot().children();
        while (en.hasMoreElements()) {
            Node n = (Node) en.nextElement();
            fillFrom(n);
        }
    }

    /**
     * Renders the annotated document, one token per line. A token with
     * recorded categories is followed by {@code " [name1,name2,...]"}.
     *
     * @return the annotated token listing
     */
    public String toString() {
        StringBuffer sb = new StringBuffer();
        for (Iterator iter = tokens.iterator(); iter.hasNext();) {
            Token token = (Token) iter.next();
            sb.append(token.getText());

            Set categories = (Set) tokenToCategorySet.get(token);
            if (categories != null) {
                sb.append(" [");
                // Write the separator *between* names rather than trimming a
                // trailing comma afterwards: trimming would eat the opening
                // bracket if the set were ever empty.
                boolean first = true;
                for (Iterator catIter = categories.iterator(); catIter.hasNext();) {
                    CategoryNode cat = (CategoryNode) catIter.next();
                    if (!first) {
                        sb.append(",");
                    }
                    sb.append(cat.getName());
                    first = false;
                }
                sb.append("]");
            }
            sb.append("\n");
        }
        return sb.toString();
    }

    /**
     * Records {@code n} against every token it matches and then recurses
     * into its children. Nodes that are not {@link CategoryNode}s are
     * ignored, and their children are not visited.
     *
     * @param n dictionary node to process
     */
    protected void fillFrom(Node n) {
        if (!(n instanceof CategoryNode)) {
            return;
        }

        // Progress output on stderr so it does not mix with the annotation
        // printed on stdout by main.
        System.err.println("Processing category " + n.getName());

        TokenList matches = dictionary.getMatchingTokens(tokens, n);
        for (Iterator iter = matches.iterator(); iter.hasNext();) {
            Token token = (Token) iter.next();
            Set categories = (Set) tokenToCategorySet.get(token);
            if (categories == null) {
                categories = new HashSet();
                tokenToCategorySet.put(token, categories);
            }
            categories.add(n);
        }

        Enumeration en = n.children();
        while (en.hasMoreElements()) {
            fillFrom((Node) en.nextElement());
        }
    }

    /**
     * Loads a dictionary from either a project file or a standalone
     * dictionary file, depending on the version reported by
     * {@link ImportUtil#getVersion}.
     *
     * @param f project or dictionary file
     * @return the dictionary contained in {@code f}
     * @throws Exception if the file is of neither kind or cannot be imported
     */
    private static YKDictionary loadDictionary(File f) throws Exception {
        int version = ImportUtil.getVersion(f);
        if (version == ImportUtil.YKPROJECT_050805_FILE) {
            YKProject proj = ImportUtil.importYKProject(f);
            return proj.getDictionary();
        }
        if (version == ImportUtil.YKDICTIONARY_050805_FILE) {
            return ImportUtil.importYKDictionary(f);
        }
        throw new Exception("Cannot parse dictionary from '"
                + f.getCanonicalPath() + "'");
    }

    /**
     * Command-line entry point: annotates a document and prints the result
     * to standard output. Exits with status 1 on bad usage or any failure.
     *
     * @param args {@code args[0]} is the dictionary or project file,
     *             {@code args[1]} is the document to annotate
     */
    public static void main(String[] args) {
        // The arguments are honoured as given; they are no longer replaced
        // by hard-coded, machine-specific paths.
        if (args == null || args.length < 2) {
            System.err.println(
                    "Usage: Annotator <dictionary-or-project-file> <document-file>");
            System.exit(1);
            return;
        }

        try {
            // arg 1 is the dictionary or project
            YKDictionary d = loadDictionary(new File(args[0]));

            // arg 2 is the file
            File doc = new File(args[1]);
            Annotator annot = new Annotator(d, doc);
            System.out.println(annot);
        } catch (Exception ex) {
            ex.printStackTrace();
            System.exit(1);
        }
    }
}