package project.core.mbeans.processing; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.sql.SQLException; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; import java.util.Map.Entry; import javax.annotation.PostConstruct; import javax.annotation.PreDestroy; import javax.ejb.Stateful; import javax.ejb.Stateless; import javax.naming.InitialContext; import org.jboss.annotation.ejb.RemoteBinding; import org.jboss.annotation.ejb.cache.simple.CacheConfig; import com.google.gwt.user.server.rpc.UnexpectedException; import project.client.persistence.Message; import project.core.mbeans.database.ConnectionManagerMysqlImpl; import project.core.persistence.PersistenceLoaderMBean; import project.persistence.properties.MessageWithProperties; import project.utils.statistics.MessageSimilarity; import project.utils.statistics.impl.MessageSimilarity_Impl; @RemoteBinding(jndiBinding="MessageLinkProcessor") @Stateful @CacheConfig(removalTimeoutSeconds=18000L) public class MessageLinkProcessor extends ConnectionManagerMysqlImpl implements MessageLinkProcessorMBean, Runnable { private static final String WORD_DB_DELIMITER = ";"; private static final String WORD_STRENGTH_DELIMITER = ":"; // // used by link calculus // private static final String STR_SPECIALWORD_LINK_LOCATION = "/home/alexd/workspace/proiect-diploma/config/special-words-links.in"; private static final float MIN_SIMILARITY = 0.001f; private static final int MIN_NUM_WORDS = 10; /** * This is the maximum count of similarities saved in database. The measure is taken into * account only if the number of similarities found is greater than this number. <br/> * * <br/> * Because storing space has O(n^2) complexity (where n is the number of messages existing in * database), storing a large number (n >> 1) of messages will bloat the space used, not to mention * slowing down the application. Limiting the similarities saved to this value, a better, O(n) * storing algorithm is achieved. * * <br/> * <br/> * Decreasing this number to <b>1</b> will cause the mechanism to save the most relevant similarity * (best first strategy, since data arrived progressively) between the current message and * all the other (existing already in database) */ private static final int MAX_SIMILARITY_COUNT_SAVE = 20; private List<MessageWithProperties> queue = null; private boolean bRunning = true; // // connection used to load existing messages (and update them) // private PersistenceLoaderMBean loader = null; // // When the messages saved in database reach a couple of hundreds, loading the same // list of previous message over and over (when processing a new message) will take a // lot of time and is extremely inneficient // private List<MessageWithProperties> messages = new LinkedList<MessageWithProperties> (); // TODO = null; private Map<String,Float> specialLinks = null; @PostConstruct public void start() { //System.out.println ("MessageLinkProcessor service started"); //JndiBinder.bind("MessageLinkProcessor", this); try { InitialContext context = new InitialContext (); loader = (PersistenceLoaderMBean) context.lookup("PersistenceLoader"); // // load existing messages (along with their properties) // messages = loader.loadMessagesWithProperties (null); // // queue = new LinkedList<MessageWithProperties> (); // // this.loadSpecialWordLinks(); // // setup the connection this.setConnectionParams("ebas", "gwtebas", "bachelor_project"); // // start the thread // new Thread (this).start(); } catch (Exception e) { e.printStackTrace(); } } @PreDestroy public void stop() { // // messages.clear(); //JndiBinder.unbind("MessageLinkProcessor"); //System.out.println ("MessageLinkProcessor service stopped"); } public void addProcessedMessage(MessageWithProperties message) { queue.add(message); synchronized (this) { this.notify(); } } public int getQueueLength() { synchronized (queue) { return queue.size(); } } public void run() { while (bRunning) { MessageWithProperties msg = null; if (this.getQueueLength() != 0) { synchronized (queue) { // // get the first message in queue // msg = queue.get(0); queue.remove(0); } } else { // // wait for a message, sleep until then // synchronized (this) { try { this.wait(); } catch (InterruptedException e) {} } } if (msg != null) { System.out.println ("New message for link processing : "); System.out.println ("\t> url : " + msg.getUrl()); // // reject bad messages (they should have been dispatched and handled somewhere // else, so only by mistake the arrived here) // if (msg.hasProperty("type") && msg.getProperty("type").equals("bad-message")) { System.out.println ("Bad message!"); continue; } // // process the message (attach more properties) // msg = this.processMessage(msg); // // finally, persist the object // try { loader.insertMessage(msg); // // TODO workaround to get inserted message's row id. Upgrade this later // Message msgTmp = loader.loadMessage(msg.getUrl()); msg.setId(msgTmp.getId()); msgTmp = null; msg.saveProperties(this.getConnection()); // // well, we don't want to load the messages everytime a new one arrives, so just // use this list as storage // messages.add(msg); } catch (SQLException e) { e.printStackTrace(); // TODO log this } } } } public MessageWithProperties processMessage (MessageWithProperties message) { Map<String, Float> links = this.calculateSpecialWordLinks(message); String strLink = ""; // // Iterate through all links found and compose the string // Iterator<String> i = links.keySet().iterator(); while (i.hasNext()) { String url = i.next(); Float strength = links.get(url); strLink += url + WORD_STRENGTH_DELIMITER + strength.floatValue() + WORD_DB_DELIMITER; } message.setProperty("special-link", strLink); Map<String,Float> similarities = this.calculateTextSimilarity(message); String strSimilarity = ""; if (similarities.size() < MAX_SIMILARITY_COUNT_SAVE) { // // Iterate through all similarities found and compose the string // i = similarities.keySet().iterator(); while (i.hasNext()) { String url = i.next(); Float strength = similarities.get(url); strSimilarity += url + WORD_STRENGTH_DELIMITER + strength.floatValue() + WORD_DB_DELIMITER; this.addCrossLinkSimilarity(message,url, strength); } message.setProperty("num-similarities", "" + similarities.size()); } else { // // First, sort out the best similarities // Set<Entry<String,Float>> set = similarities.entrySet(); List<Entry<String,Float>> list = new LinkedList<Entry<String,Float>> (); Iterator<Entry<String,Float>> j = set.iterator(); while (j.hasNext()) { list.add(j.next()); } // // Sort the entries // Collections.sort(list, new Comparator<Entry<String,Float>> () { public int compare (Entry<String,Float> e1, Entry<String,Float> e2) { if (e1.getValue().equals(e2.getValue())) return 0; if (e1.getValue() < e2.getValue()) return 1; return -1; } }); // // and get the most relevant ones // list = list.subList(0, MAX_SIMILARITY_COUNT_SAVE); j = list.iterator(); while (j.hasNext()) { Entry<String,Float> entry = j.next(); String url = entry.getKey(); Float strength = entry.getValue(); strSimilarity += url + WORD_STRENGTH_DELIMITER + strength.floatValue() + WORD_DB_DELIMITER; this.addCrossLinkSimilarity(message,url, strength); } // // clear memory used // list.clear(); // set.clear () (set will be cleared below - #257 - since all elements are actually references) message.setProperty("num-similarities", "" + list.size()); } message.setProperty("similarities", strSimilarity); similarities.clear(); links.clear(); // // print message properties // //System.out.println (message.getProperties()); return message; } private Map<String,Float> calculateSpecialWordLinks (MessageWithProperties message) { Map<String, Float> links = new HashMap<String, Float> (); String strSpecialWords = message.getProperty("special-words"); if (strSpecialWords == null) { // TODO log this // //System.out.println ("The 'special-words' property not found!"); return links; } String[] tokenWords = strSpecialWords.split(WORD_DB_DELIMITER); Iterator<MessageWithProperties> i = messages.iterator(); while (i.hasNext()) { MessageWithProperties other = i.next(); if (other.hasProperty("special-words")) { String strOtherSpecialWords = other.getProperty("special-words"); String[] tokenOthers = strOtherSpecialWords.split(WORD_DB_DELIMITER); float strength = 0.0f; for (int k1 = 0; k1 < tokenWords.length; k1 ++) { for (int k2 = 0; k2 < tokenOthers.length; k2 ++) { String strComposed = tokenWords [k1] + ":" + tokenOthers [k2]; if (specialLinks.containsKey(strComposed)) { strength += specialLinks.get(strComposed); } } } links.put(other.getUrl(), new Float (strength)); } } return links; } private Map<String,Float> calculateTextSimilarity (MessageWithProperties message) { Map<String,Float> similarities = new HashMap<String, Float> (); MessageSimilarity simCalculator = new MessageSimilarity_Impl (); Iterator<MessageWithProperties> i = messages.iterator(); // // see if this message has enough words // if (message.hasProperty("num-words")) { int numWords = Integer.parseInt(message.getProperty("num-words")); if (numWords < MIN_NUM_WORDS) { // // not enough words, do not calculate this message's similarities // return similarities; } } else { // // bad message, too short. return no similarities // return similarities; } while (i.hasNext()) { MessageWithProperties other = i.next(); if (other.hasProperty("num-words")) { int otherWords = Integer.parseInt(other.getProperty("num-words")); if (otherWords < MIN_NUM_WORDS) { // // ignore this message // continue; } } else { // // number of words is unknown. // //System.out.println ("Unknown number of words. Ignoring this message"); continue; } float sim = simCalculator.sim3(message, other); if (sim > MIN_SIMILARITY) { similarities.put(other.getUrl(), sim); } } return similarities; } /** * TODO code this method * @param message * @return */ private Map<String,Float> calculateSynonimSimilarity (MessageWithProperties message) { return null; } private void loadSpecialWordLinks () { try { this.specialLinks = new HashMap<String, Float> (); String line = null; BufferedReader in = new BufferedReader (new FileReader (new File (STR_SPECIALWORD_LINK_LOCATION))); while ( (line = in.readLine()) != null) { String[] tokens = line.split(" "); for (int i = 0; i < tokens.length; i++) { tokens [i] = tokens [i].trim (); } if (tokens.length != 3) { System.out.println ("Invalid line : " + line); continue; } for (String t : tokens) { t = t.trim(); } String strCompLink = tokens [0] + ":" + tokens [1]; Float strength = Float.parseFloat(tokens [2]); specialLinks.put(strCompLink, strength); strCompLink = tokens [1] + ":" + tokens [0]; specialLinks.put(strCompLink, strength); } in.close(); } catch (IOException e) { e.printStackTrace(); //TODO move to log } } /** * Used internally to create cross-links, using similarities, between messages. This method is called * while processing a new message. * @param message * @param otherUrl * @param strength * @throws IllegalStateException */ private void addCrossLinkSimilarity (MessageWithProperties message, String otherUrl, Float strength) throws IllegalStateException { MessageWithProperties other = this.findMessageByUrl(otherUrl); if (other == null) { //System.out.println ("Unable to find message with url : " + otherUrl); TODO move to log return; } if (other.hasProperty("similarities")) { if (other.hasProperty("num-similarities")) { int numSims = Integer.parseInt(other.getProperty("num-similarities")); if (numSims < MAX_SIMILARITY_COUNT_SAVE) { // the simple case, where other message's similarity counts is lower than maximum allowed. // in this case, simply set the new property concatenating message's url and strength // String strProp = other.getProperty("similarities"); strProp += message.getUrl() + WORD_STRENGTH_DELIMITER + strength + WORD_DB_DELIMITER; other.setProperty("similarities", strProp); // also increase the similarity count other.setProperty("num-similarities", "" + (numSims + 1)); } else { // the second case, a bit more complex : the similarity count reached maximum allowed value // and a position must be removed first TODO } } else { // no similarity count. the message was not processed with this code, just throw an exception // throw new IllegalStateException ("Cannot find 'num-similarities' property for message : " + message.getUrl()); } } else { // no similarities, another simple case because this would be the first similarity inserted // String strProp = message.getUrl() + WORD_STRENGTH_DELIMITER + strength + WORD_DB_DELIMITER; other.setProperty("similarities", strProp); // set the similarity count to 1 other.setProperty("num-similarities", "1"); } // at this point, one of the branches above has changed properties for the other message. // persist the object in database (in list the operation is not needed, since java works with handles) try { other.saveProperties(this.getConnection()); } catch (SQLException e) { e.printStackTrace(); // TODO move to log } } /** * Used by {@link #addCrossLinkSimilarity(MessageWithProperties, String, Float)} * @param url * @return */ private MessageWithProperties findMessageByUrl (String url) { Iterator<MessageWithProperties> i = messages.iterator(); while (i.hasNext()) { MessageWithProperties msg = i.next(); if (msg.getUrl().equals(url)) return msg; } return null; } }