package project.core.mbeans.processing; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.net.URL; import java.sql.Connection; import java.sql.ResultSet; import java.sql.Statement; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; import javax.annotation.PostConstruct; import javax.annotation.PreDestroy; import javax.ejb.Stateful; import javax.naming.InitialContext; import org.jboss.annotation.ejb.RemoteBinding; import org.jboss.annotation.ejb.cache.simple.CacheConfig; import com.swabunga.spell.engine.SpellDictionaryHashMap; import com.swabunga.spell.event.SpellChecker; import edu.mit.jwi.Dictionary; import edu.mit.jwi.IDictionary; import project.client.persistence.Message; import project.core.mbeans.database.ConnectionManagerMysqlImpl; import project.persistence.properties.MessageWithProperties; import project.utils.statistics.impl.Stemmer; @RemoteBinding(jndiBinding = "MessageProcessing") @Stateful @CacheConfig(removalTimeoutSeconds=18000L) public class MessageProcessing extends ConnectionManagerMysqlImpl implements MessageProcessingMBean, Runnable { // // used by tokenizer // private static final String WORD_INPUT_DELIMITERS = "[ \t\r\n:*)(,%^&*$#/~!;.?`'\"-]"; private static final String WORD_DB_DELIMITER = ";"; // // used by frequency counter // private static final String WORD_FREQ_DELIMITER = ":"; // // used by 'bad message' classifier // private static final int MIN_MESSAGE_LENGTH = 10; private static final int MAX_SUGGESTIONS = 5; // // used by dictionary // private static final String STR_DICTIONARY_LOCATION = "/home/alexd/workspace/proiect-diploma/input/dict"; // // used by spell checking // private static final String STR_SPELLCHECKER_LOCATION = "/home/alexd/workspace/proiect-diploma/input/english.0/english.0"; // // used by special word algorithm // private static final String STR_SPECIALWORD_LOCATION = "/home/alexd/workspace/proiect-diploma/config/special-words.in"; private static IDictionary dictionary = null; static { try { // construct the URL to the Wordnet dictionary directory URL url = new URL("file", null, STR_DICTIONARY_LOCATION); dictionary = new Dictionary(url); dictionary.open(); } catch (Exception e) { e.printStackTrace(); } } private List<Message> queue = null; private boolean bRunning = true; private List<String> stopWords = null; private List<String> specialWords = null; private SpellChecker spellChecker = null; private MessageDispatcherMBean dispatcher = null; public void addRawMessage(Message message) { queue.add(message); synchronized (this) { this.notify(); } } public int getQueueLength() { synchronized (queue) { return queue.size(); } } @PostConstruct public void start() { // System.out.println ("MessageProcessing started"); queue = new LinkedList<Message>(); // setJndiName(this.getClass().getSimpleName()); // // load the stopWords stopWords = new LinkedList<String>(); // // load the specialWords specialWords = new LinkedList<String>(); // // load the dispatcher this.initDispatcher(); try { this.setConnectionParams("ebas", "gwtebas", "bachelor_project"); Connection connection = this.getConnection(); String query = "select content from Word where labels like 'stop'"; Statement s0 = connection.createStatement(); ResultSet set = s0.executeQuery(query); while (set.next()) { stopWords.add(set.getString("content")); } set.close(); s0.close(); // // init the spell checker // this.initSpellCheckDictionary(); // // BufferedReader in = new BufferedReader(new FileReader(new File( STR_SPECIALWORD_LOCATION))); String specialWord = null; while ((specialWord = in.readLine()) != null) { specialWord = specialWord.trim(); specialWords.add(specialWord); } in.close(); // // start the thread // new Thread(this).start(); } catch (Exception e) { e.printStackTrace(); } } @PreDestroy public void stop() { bRunning = false; synchronized (this) { this.notify(); } /* * try { unbind (); } catch (Exception e) { e.printStackTrace(); } */ // System.out.println ("MessageProcessing stopped"); } public void run() { while (bRunning) { Message msg = null; if (this.getQueueLength() != 0) { synchronized (queue) { // // get the first message in queue // msg = queue.get(0); queue.remove(0); } } else { // // wait for a message, sleep until then // synchronized (this) { try { this.wait(); } catch (InterruptedException e) { } } } if (msg != null) { // // Process the message // System.out.println("New message for processing:"); System.out.println("\t> url : " + msg.getUrl()); // System.out.println ("\t> content : " + msg.getContent()); MessageWithProperties processedMessageWithProperties = this .processMessage(msg); // System.out.println ("\t> formatted : " + // processedMessageWithProperties.getFormattedContent()); // // advance this message to the dispatcher // // let the dispatcher handle the processed message if (dispatcher != null) { dispatcher.dispatchMessage(processedMessageWithProperties); } else { System.out.println("dispatcher is null !!!"); } } } } /** * Tokenize an input string according to the {@link #WORD_INPUT_DELIMITERS} * * @param input * @return */ private String[] tokenize(String input) { // // lowercase the input // input = input.toLowerCase(); // // replace html markers // input = this.replaceHtmlMarkers(input); // // and return the tokenization // return input.split(WORD_INPUT_DELIMITERS); } /** * Replace the Html markers from input string * * @param input * @return */ private String replaceHtmlMarkers(String input) { input = input.replaceAll("<", "<"); input = input.replaceAll(">", ">"); input = input.replaceAll(""", "\""); input = input.replaceAll("'", "'"); input = input.replaceAll(" ", " "); input = input.replaceAll(" ", " "); return input; } /** * Stem the word (root part of it) * * @param input * @return */ private String stem(String input) { return Stemmer.stem(input); } /** * Find out if a word is stop word or not * * @param input * raw input (unstemmed) * @return */ private boolean isStopWord(String input) { // // first lowercase input = input.toLowerCase(); // // stem it // input = stem (input); // // check stop words list return stopWords.contains(input); } private boolean isSpecialWord(String input) throws IllegalArgumentException { // // first lowercase input = input.toLowerCase(); if (!input.matches("[a-z]*")) throw new IllegalArgumentException("Invalid input string: " + input); Iterator<String> i = specialWords.iterator(); while (i.hasNext()) { String specialWord = i.next(); if (input.contains(specialWord) && input.length() > (specialWord.length() + 2)) return true; } return false; } /** * Given a vector of String objects, match valid words and concatenate them * in a single String. * * @param tokens * @return */ private String accumulateTokens(String[] words) { List<String> validWordsList = new LinkedList<String>(); List<Integer> validWordsPositions = new LinkedList<Integer>(); int index = 0; for (String w : words) { w = w.trim(); if (w.matches("[a-z]*")) { validWordsList.add(w); validWordsPositions.add(index); } index++; } String[] wordsProcessed = validWordsList .toArray(new String[validWordsList.size()]); // // 3.2. and create the final formatted string // index = 0; String strFormattedMessage = ""; for (String w : wordsProcessed) { if (w.length() > 0) { // // if (validWordsPositions.contains(new Integer(index))) { if (this.isStopWord(w)) strFormattedMessage += w + "-stop" + WORD_DB_DELIMITER; else if (this.isSpecialWord(w)) strFormattedMessage += w + "-special" + WORD_DB_DELIMITER; else { String suggestions = this.spellCheck(w); if (suggestions.equals(w)) strFormattedMessage += w + WORD_DB_DELIMITER; else { // // the word is misspelled // // TODO what's next? } } } else { strFormattedMessage += WORD_DB_DELIMITER; } } else { strFormattedMessage += WORD_DB_DELIMITER; } index++; } // // 3.3. clear used memory // wordsProcessed = null; validWordsList.clear(); validWordsPositions.clear(); return strFormattedMessage; } /** * Returns the word frequency, given a formatted input (after it has been * processed) * * @param input * @return */ private Map<String, Integer> getWordFrequency(String input) { Map<String, Integer> frequencies = new HashMap<String, Integer>(); // // String[] tokens = input.split(WORD_DB_DELIMITER); for (String t : tokens) { if (t.length() > 0) { // // only count frequency for words that are not 'stop' or // 'special' // if (t.indexOf("-") == -1) { Integer f = frequencies.get(t); if (f == null) { f = new Integer(1); } else { f = new Integer(f + 1); } frequencies.put(t, f); } } } tokens = null; return frequencies; } /** * Calculate the properties for message * * @param message * @return */ private MessageWithProperties calculateMessageProperties( MessageWithProperties message) { if (message.getFormattedContent().equals("")) { message.setProperty("type", "bad-message"); } else { // // compose the frequency string // String strFrequencies = ""; Map<String, Integer> frequencies = this.getWordFrequency(message .getFormattedContent()); Iterator<String> i = frequencies.keySet().iterator(); while (i.hasNext()) { String key = i.next(); Integer f = frequencies.get(key); strFrequencies += key + WORD_FREQ_DELIMITER + f + WORD_DB_DELIMITER; } message.setProperty("frequencies", strFrequencies); // // calculate number of valid words // message.setProperty("num-words", "" + frequencies.size()); // // compose the stop words string // String strStopWords = ""; String[] tokens = message.getFormattedContent().split( WORD_DB_DELIMITER); for (String t : tokens) { if (t.indexOf("-stop") != -1) { strStopWords += t.substring(0, t.indexOf('-')) + WORD_DB_DELIMITER; } } if (!strStopWords.equals("")) message.setProperty("stop-words", strStopWords); // // compose the special words string // String strSpecialWords = ""; for (String t : tokens) { if (t.indexOf("-special") != -1) { strSpecialWords += t.substring(0, t.indexOf('-')) + WORD_DB_DELIMITER; } } if (!strSpecialWords.equals("")) message.setProperty("special-words", strSpecialWords); } return message; } public MessageWithProperties processMessage(Message message) { MessageWithProperties processedMessage = new MessageWithProperties( message); if (processedMessage.getContent() != null) { if (processedMessage.getContent().length() > MIN_MESSAGE_LENGTH) { // // String[] words = this.tokenize(processedMessage.getContent()); // // processedMessage.setFormattedContent(this .accumulateTokens(words)); // // now calculate properties // processedMessage = this .calculateMessageProperties(processedMessage); } else { // This is not a valid message // System.out .println("\t> invalid message (message's content is too short)"); processedMessage.setContent(""); processedMessage.setFormattedContent(""); } } else { // This is not a valid message // System.out .println("\t> invalid message (message's content is null)"); processedMessage.setContent(""); processedMessage.setFormattedContent(""); } // // return the message return processedMessage; } private void initDispatcher() { if (dispatcher == null) { try { InitialContext context = new InitialContext(); dispatcher = (MessageDispatcherMBean) context .lookup("MessageDispatcher"); } catch (Exception e) { e.printStackTrace(); } } } private void initSpellCheckDictionary() { File dict = new File(STR_SPELLCHECKER_LOCATION); try { spellChecker = new SpellChecker(new SpellDictionaryHashMap(dict)); } catch (Exception e) { e.printStackTrace(); } } private String spellCheck(String input) throws IllegalArgumentException { if (!input.contains(" ")) { if (spellChecker.isCorrect(input)) return input; else { List suggestions = spellChecker.getSuggestions(input, 10); Iterator i = suggestions.iterator(); int idx = 1; String strSuggestions = ""; while (i.hasNext()) { String sugg = i.next().toString(); strSuggestions += sugg; if (idx < MAX_SUGGESTIONS) strSuggestions += ","; idx++; } return strSuggestions; } } else throw new IllegalArgumentException( "spellCheck can only verify single words"); } }