//
// Copyright 2010 Cinch Logic Pty Ltd.
//
// http://www.chililog.com
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package org.chililog.server.common;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;

import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

/**
 * Tokenizes text into unique keywords.
 */
public class TextTokenizer {

    /**
     * Returns the singleton instance of this class
     */
    public static TextTokenizer getInstance() {
        return SingletonHolder.INSTANCE;
    }

    /**
     * SingletonHolder is loaded on the first execution of TextTokenizer.getInstance() or the first access to
     * SingletonHolder.INSTANCE, not before.
     *
     * @see <a href="http://en.wikipedia.org/wiki/Singleton_pattern">Singleton pattern</a>
     */
    private static class SingletonHolder {
        public static final TextTokenizer INSTANCE = new TextTokenizer();
    }

    /**
     * Private constructor to enforce the singleton pattern
     */
    private TextTokenizer() {
    }

    /**
     * <p>
     * Tokenizes text to extract keywords.
     * </p>
     * <p>
     * We use the Lucene <code>StandardAnalyzer</code> with a bit of spice. We want to break up domain names, class
     * names and email addresses, so we have to do some extra parsing. On its own, Lucene parses as follows:
     * </p>
     * <ul>
     * <li>"email@address.com" becomes ["email@address", "com"]</li>
     * <li>"com.chililog.server.common.ChiliLogExceptionTest" becomes ["com.chililog.server.common",
     * "chililogexceptiontest"]</li>
     * </ul>
     * <p>
     * We do not use regular expressions because they are slow. We have implemented this as a singleton so that in
     * the future we can allow user customization.
     * </p>
     *
     * @param text
     *            Text from which to extract keywords
     * @param maxKeywords
     *            Maximum number of keywords to extract. If negative, no limit is applied.
     * @return Array of unique keywords
     * @throws IOException
     *             if an error occurs while reading the token stream
     */
    public ArrayList<String> tokenize(String text, long maxKeywords) throws IOException {
        ArrayList<String> tokens = new ArrayList<String>();
        if (StringUtils.isEmpty(text) || maxKeywords == 0) {
            return tokens;
        }

        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
        HashMap<String, String> lookup = new HashMap<String, String>();
        TokenStream stream = analyzer.tokenStream("field", new StringReader(text));
        try {
            StringBuilder sb = new StringBuilder();
            TermAttribute termAttribute = stream.getAttribute(TermAttribute.class);
            while (stream.incrementToken()) {
                char[] termBuffer = termAttribute.termBuffer();
                int length = termAttribute.termLength();

                // Decide if we want to split this term on '.' and '@'. Terms that start with a digit
                // (e.g. IP addresses and version numbers) or contain characters other than letters,
                // digits, '.' and '@' are kept whole.
                boolean doSplit = true;
                if (Character.isDigit(termBuffer[0])) {
                    doSplit = false;
                } else {
                    for (int j = 0; j < length; j++) {
                        char c = termBuffer[j];
                        if (!Character.isLetterOrDigit(c) && c != '.' && c != '@') {
                            doSplit = false;
                            break;
                        }
                    }
                }

                if (doSplit) {
                    sb.setLength(0);
                    for (int i = 0; i < length; i++) {
                        char c = termBuffer[i];
                        if (c == '.' || c == '@') {
                            if (!addToken(tokens, lookup, sb.toString(), maxKeywords)) {
                                return tokens;
                            }
                            sb.setLength(0);
                        } else {
                            sb.append(c);
                        }
                    }

                    // Add the last part
                    if (!addToken(tokens, lookup, sb.toString(), maxKeywords)) {
                        return tokens;
                    }
                } else {
                    // No splitting, just add the whole term
                    if (!addToken(tokens, lookup, termAttribute.term(), maxKeywords)) {
                        return tokens;
                    }
                }
            }
        } finally {
            // Release the token stream's resources
            stream.close();
        }
        return tokens;
    }

    /**
     * Adds a token to our collection, ignoring blanks and duplicates
     *
     * @param tokens
     *            collection of tokens
     * @param lookup
     *            lookup hashmap used to detect duplicate tokens
     * @param token
     *            token or term to add to the collection
     * @param maxKeywords
     *            maximum number of keywords
     * @return True if it is OK to keep adding tokens, False if no more tokens should be added
     */
    private boolean addToken(ArrayList<String> tokens, HashMap<String, String> lookup, String token, long maxKeywords) {
        if (!StringUtils.isBlank(token) && !lookup.containsKey(token)) {
            tokens.add(token);
            lookup.put(token, null);
            if (maxKeywords > 0 && tokens.size() >= maxKeywords) {
                return false;
            }
        }
        return true;
    }
}
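// A minimal usage sketch (not part of the original source), assuming Lucene 3.0 and Commons Lang are on
// the classpath as the imports above require:
//
//     ArrayList<String> keywords = TextTokenizer.getInstance().tokenize("bob@example.com logged in", 10);
//
// StandardAnalyzer lower-cases terms and drops English stop words such as "in", and the extra parsing above
// splits the email address, so the result would contain entries such as "bob", "example", "com" and "logged".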