package edu.harvard.wcfia.yoshikoder.document.tokenizer;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashSet;
import java.util.Locale;
import java.util.Properties;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

import edu.harvard.wcfia.yoshikoder.document.YKDocument;

/**
 * A singleton service that tokenizes documents on the basis of their locale.
 * <p>
 * Tokenization is delegated to a locale-specific tokenizer plugin loaded from
 * the plugins directory when one is available; otherwise the service falls
 * back to {@link BITokenizerImpl}. The plugins directory is read from the
 * {@code tokenizer.plugins.dir} key in {@code plugins.properties} on the
 * classpath, defaulting to {@code .yoshikoder/plugins/tokenizers} under the
 * user's home directory.
 *
 * @author will
 */
public class TokenizationService {

    private static final Logger log =
        Logger.getLogger("edu.harvard.wcfia.yoshikoder.document.tokenizer.TokenizationService");

    /** Lazily created singleton instance; access via {@link #getTokenizationService()}. */
    private static TokenizationService ts;

    /** Directory scanned for tokenizer plugin jars. */
    protected File pluginsDir;
    /** Loads and manages the tokenizer plugins found in {@link #pluginsDir}. */
    protected TokenizerSource tokenizerSource;
    /** Metadata for every tokenizer plugin currently known to the service. */
    protected Set<TM> tokenizerMetadata;

    private TokenizationService() {
        Properties props = new Properties();
        // try-with-resources so the properties stream is always closed
        // (the original leaked it). getResourceAsStream returns null when
        // the resource is missing; treat that like a read failure and fall
        // back to the default directory rather than NPE-ing in props.load.
        try (InputStream str = TokenizationService.class.getClassLoader()
                .getResourceAsStream("plugins.properties")) {
            if (str == null) {
                log.warning("plugins.properties not found on classpath; "
                        + "using default tokenizer plugins directory");
            } else {
                props.load(str);
            }
        } catch (IOException ioe) {
            log.log(Level.WARNING,
                    "Couldn't get tokenizer plugins directory from plugins.properties", ioe);
        }
        String dir = props.getProperty("tokenizer.plugins.dir",
                ".yoshikoder/plugins/tokenizers");
        pluginsDir = new File(System.getProperty("user.home"), dir);
        log.info("Making a new TokenizerSource with pluginDir: " + pluginsDir);
        tokenizerSource = new TokenizerSource(pluginsDir);
        log.info("Getting available tokenizer plugins from tokenizer source");
        Set<TM> md = tokenizerSource.getAvailableTokenizerPlugins();
        log.info("Initializing tokenizerMetadata with the sources available plugins");
        tokenizerMetadata = new HashSet<TM>(md);
    }

    /**
     * Returns the shared {@code TokenizationService}, creating it on first use.
     * Synchronized so that concurrent first calls cannot construct two instances.
     *
     * @return the singleton service
     */
    public static synchronized TokenizationService getTokenizationService() {
        if (ts == null) {
            ts = new TokenizationService();
        }
        return ts;
    }

    /**
     * Returns a tokenizer for the given locale: a plugin tokenizer when one is
     * installed and loadable, otherwise a {@link BITokenizerImpl} fallback.
     *
     * @param loc the locale to tokenize for
     * @return a tokenizer, never {@code null}
     */
    protected Tokenizer getTokenizer(Locale loc) {
        try {
            Tokenizer tok = tokenizerSource.getTokenizerPlugin(loc);
            if (tok != null) {
                return tok;
            }
        } catch (PluginException ple) {
            log.log(Level.WARNING, "Failed to load existing plugin tokenizer. "
                    + "Falling back to BITokenizer", ple);
        }
        return new BITokenizerImpl(loc);
    }

    /**
     * Tokenizes a document using a tokenizer appropriate to its locale.
     * Documents without a locale are tokenized with the platform default locale.
     *
     * @param doc the document whose text is tokenized
     * @return the tokens of the document's text
     * @throws TokenizationException if the tokenizer fails
     * @throws IOException if the document text cannot be read
     */
    public TokenList tokenize(YKDocument doc) throws TokenizationException, IOException {
        Locale loc = doc.getLocale();
        if (loc == null) {
            loc = Locale.getDefault();
        }
        Tokenizer tok = getTokenizer(loc);
        String text = doc.getText();
        return tok.getTokens(text);
    }

    /**
     * Installs a new tokenizer plugin from a jar file and records its metadata.
     *
     * @param f the plugin jar file
     * @return the metadata of the newly installed plugin
     * @throws PluginException if the plugin cannot be installed
     *         (including duplicates — see {@code DuplicatePluginException})
     */
    public TM addTokenizerPlugin(File f) throws PluginException {
        log.info("Calling addTokenizerPlugin in TokenizationService");
        TM metadatum = tokenizerSource.addTokenizerPlugin(f);
        log.info("Adding the metadata handed back to the tokenizerMetadata");
        tokenizerMetadata.add(metadatum);
        log.info("returning the metadata");
        return metadatum;
    }

    /**
     * Replaces an installed plugin with a new jar, updating the cached metadata.
     *
     * @param existingPlugin metadata of the plugin being replaced
     * @param newPlugin the replacement jar file
     * @return the metadata of the replacement plugin
     * @throws PluginException if the replacement fails
     */
    public TM replaceTokenizerPlugin(TM existingPlugin, File newPlugin)
            throws PluginException {
        TM latest = tokenizerSource.replaceTokenizerPlugin(existingPlugin, newPlugin);
        tokenizerMetadata.remove(existingPlugin);
        tokenizerMetadata.add(latest); // should be the same
        return latest;
    }

    /**
     * Uninstalls a tokenizer plugin and forgets its metadata.
     *
     * @param metadatum metadata of the plugin to remove
     */
    public void removeTokenizerPlugin(TM metadatum) {
        tokenizerSource.removeTokenizerPlugin(metadatum);
        tokenizerMetadata.remove(metadatum);
    }

    /**
     * Returns the metadata of every installed tokenizer plugin.
     * <p>
     * NOTE(review): this returns the live internal set (as the original did,
     * to preserve compatibility with existing callers); mutating it bypasses
     * {@link #addTokenizerPlugin} / {@link #removeTokenizerPlugin}.
     *
     * @return the plugin metadata set
     */
    public Set<TM> getTokenizerPluginMetadata() {
        log.info("returning the plugin metadata from TokenizationService");
        return tokenizerMetadata;
    }

    /** Deletes every file directly inside {@code dir}; a no-op when the directory is absent. */
    private static void deleteContents(File dir) {
        File[] contents = dir.listFiles();
        if (contents == null) {
            return; // dir does not exist or is not a directory
        }
        for (int ii = 0; ii < contents.length; ii++) {
            contents[ii].delete();
        }
    }

    /** Lists every file directly inside {@code dir}, flagging each as unexpected leftover. */
    private static void reportLeftovers(File dir) {
        File[] contents = dir.listFiles();
        if (contents == null) {
            return; // guard against NPE when the directory is missing
        }
        for (int ii = 0; ii < contents.length; ii++) {
            System.out.println(contents[ii] + "should not exist...");
        }
    }

    /**
     * Manual test harness for plugin install / duplicate / replace / remove.
     * Depends on a developer-local jar path; not part of the public API.
     */
    public static void main(String[] args) throws Exception {
        // clear the plugins dir first...
        File f = new File(System.getProperty("user.home"),
                ".yoshikoder/plugins/tokenizers");
        deleteContents(f);

        System.out.println("CASE 1: No duplicates, empty plugins directory");
        TokenizationService service = TokenizationService.getTokenizationService();
        TM tm = service.addTokenizerPlugin(
                new File("/Users/will/java/chinese-tokenizers/SCTokenizer.jar"));
        System.out.println(tm);
        service.removeTokenizerPlugin(tm);
        reportLeftovers(f);

        System.out.println("CASE 2: One duplicate, should throw a Duplicate Exception");
        tm = service.addTokenizerPlugin(
                new File("/Users/will/java/chinese-tokenizers/SCTokenizer.jar"));
        try {
            service.addTokenizerPlugin(
                    new File("/Users/will/java/chinese-tokenizers/SCTokenizer.jar"));
        } catch (DuplicatePluginException de) {
            log.info("DUPLICATE EXCEPTION!");
        }

        System.out.println("CASE 3: One duplicate, replaced");
        TM tmdup = null;
        TM newdup = null;
        try {
            tmdup = service.addTokenizerPlugin(
                    new File("/Users/will/java/chinese-tokenizers/SCTokenizer.jar"));
        } catch (DuplicatePluginException dp) {
            log.info("Replacing plugin");
            newdup = service.replaceTokenizerPlugin(tm,
                    new File("/Users/will/java/chinese-tokenizers/SCTokenizer.jar"));
        }
        service.removeTokenizerPlugin(newdup);
        reportLeftovers(f);
    }
}