MessageProcessing.java example

Explorer

similarity-for-message-threads-master
- src
  - project
    - client
    - core
      - crawlers
        ThreadCrawlerImpl.java
      - mbeans
        analysis
        MessageBaseProcessor.java
        MessageBaseProcessorMBean.java
        crawlers
        CMessageThreadRunnable.java
        CrawlerData.java
        CrawlerSettingsData.java
        IMessageThreadRunnable.java
        ThreadCrawler.java
        ThreadCrawlerMBean.java
        database
        ConnectionManager.java
        ConnectionManagerMysqlImpl.java
        ConnectionModule.java
        processing
        MessageDispatcher.java
        MessageDispatcherMBean.java
        MessageLinkProcessor.java
        MessageLinkProcessorMBean.java
        MessageProcessing.java
        MessageProcessingMBean.java
        search
        MessageSeachMBean.java
        MessageSearch.java
      - persistence
        CheckMessageProperties.java
        InsertStopWords.java
        PersistenceLoader.java
        PersistenceLoaderMBean.java
      - servlets
        RpcServlet.java
    - persistence
      - builder
        MessageBoardCrawler.java
        MessageThreadCrawler.java
        impl
        BaseHttpClient.java
        GoogleGroupsMessageBoardCrawler.java
        GoogleGroupsThreadCrawler.java
        MyVisitor.java
        Test.java
        TestSocialNetwork.java
        TestSpelling.java
      - properties
        MessageWithProperties.java
    - utils
      - collocation
        CollocationCumulator.java
        CollocationExtractor.java
        Matrix.java
        WordStatistics.java
        impl
        CollocationCumulator_Impl.java
        CollocationImplVer1.java
      - statistics
        MessageSimilarity.java
        ObjectLooseProperties.java
        WordRank.java
        impl
        MessageSimilarity_Impl.java
        Stemmer.java

package project.core.mbeans.processing;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.net.URL;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import javax.annotation.PostConstruct;
import javax.annotation.PreDestroy;
import javax.ejb.Stateful;
import javax.naming.InitialContext;

import org.jboss.annotation.ejb.RemoteBinding;
import org.jboss.annotation.ejb.cache.simple.CacheConfig;

import com.swabunga.spell.engine.SpellDictionaryHashMap;
import com.swabunga.spell.event.SpellChecker;

import edu.mit.jwi.Dictionary;
import edu.mit.jwi.IDictionary;

import project.client.persistence.Message;
import project.core.mbeans.database.ConnectionManagerMysqlImpl;
import project.persistence.properties.MessageWithProperties;
import project.utils.statistics.impl.Stemmer;

@RemoteBinding(jndiBinding = "MessageProcessing")
@Stateful
@CacheConfig(removalTimeoutSeconds=18000L)
public class MessageProcessing extends ConnectionManagerMysqlImpl implements
		MessageProcessingMBean, Runnable {

	//
	// used by tokenizer
	//
	// WORD_INPUT_DELIMITERS is a regex character class; tokenize() passes it
	// to String.split, so every listed character (whitespace and punctuation)
	// separates two tokens.
	// WORD_DB_DELIMITER separates the entries of the formatted-content string
	// and of the property strings built by this class.
	//
	private static final String WORD_INPUT_DELIMITERS = "[ \t\r\n:*)(,%^&*$#/~!;.?`'\"-]";
	private static final String WORD_DB_DELIMITER = ";";

	//
	// used by frequency counter
	//
	// separates a word from its count inside the "frequencies" property
	// (word:count;word:count;...)
	//
	private static final String WORD_FREQ_DELIMITER = ":";

	//
	// used by 'bad message' classifier
	//
	// processMessage() only processes content whose length is strictly
	// greater than this value; shorter content is reported as invalid.
	//
	private static final int MIN_MESSAGE_LENGTH = 10;

	// referenced by spellCheck() while it builds the comma-separated
	// suggestion string
	private static final int MAX_SUGGESTIONS = 5;

	//
	// used by dictionary
	//
	// directory handed to the WordNet Dictionary in the static initializer.
	// NOTE(review): absolute, machine-specific path - confirm how this is
	// meant to be configured on other hosts.
	//
	private static final String STR_DICTIONARY_LOCATION = "/home/alexd/workspace/proiect-diploma/input/dict";

	//
	// used by spell checking
	//
	// word-list file loaded into the SpellDictionaryHashMap by
	// initSpellCheckDictionary()
	//
	private static final String STR_SPELLCHECKER_LOCATION = "/home/alexd/workspace/proiect-diploma/input/english.0/english.0";

	//
	// used by special word algorithm
	//
	// text file read line by line in start(); each line is one special word
	//
	private static final String STR_SPECIALWORD_LOCATION = "/home/alexd/workspace/proiect-diploma/config/special-words.in";

	/**
	 * WordNet dictionary shared by all instances. It stays {@code null} when
	 * the dictionary could not be constructed or opened.
	 */
	private static IDictionary dictionary = null;
	static {

		try {

			// construct the URL to the Wordnet dictionary directory
			URL url = new URL("file", null, STR_DICTIONARY_LOCATION);

			// Open the dictionary through a local reference and publish it
			// only once open() has returned normally; otherwise a failed
			// open would leave a non-null but unusable dictionary behind.
			IDictionary candidate = new Dictionary(url);
			candidate.open();
			dictionary = candidate;

		} catch (Exception e) {
			// best effort: the class still loads, 'dictionary' remains null
			e.printStackTrace();
		}
	}

	// Messages waiting to be processed. getQueueLength() and run() access
	// the list while holding the list's own monitor.
	private List<Message> queue = null;

	// Loop flag of the worker thread. stop() clears it from a different
	// thread than the one executing run(), so it must be volatile for the
	// worker to be guaranteed to observe the change.
	private volatile boolean bRunning = true;

	// Stop words, filled in start() from the Word table.
	private List<String> stopWords = null;

	// Special words, filled in start() from STR_SPECIALWORD_LOCATION.
	private List<String> specialWords = null;

	// Spell checker, created by initSpellCheckDictionary(); stays null when
	// the dictionary file cannot be loaded.
	private SpellChecker spellChecker = null;

	// Receiver of processed messages, resolved by initDispatcher(); stays
	// null when the JNDI lookup fails.
	private MessageDispatcherMBean dispatcher = null;

	/**
	 * Append a raw message to the processing queue and wake up the worker
	 * thread if it is waiting for work.
	 * 
	 * @param message
	 *            the message to enqueue
	 */
	public void addRawMessage(Message message) {

		//
		// the queue is a plain LinkedList which is read and modified by the
		// worker thread under the queue's monitor, so the insertion has to
		// take the same monitor
		//
		synchronized (queue) {
			queue.add(message);
		}

		//
		// the worker sleeps on this object's monitor
		//
		synchronized (this) {
			this.notify();
		}
	}

	/**
	 * Report how many messages are currently waiting in the queue.
	 * 
	 * @return the size of the queue, read while holding the queue's monitor
	 */
	public int getQueueLength() {

		final int pending;
		synchronized (queue) {
			pending = queue.size();
		}

		return pending;
	}

	/**
	 * Lifecycle callback: creates the queue and the word lists, resolves the
	 * dispatcher, loads stop words, spell-check dictionary and special words,
	 * and finally starts the worker thread.
	 * 
	 * Any exception raised while loading is printed and swallowed; in that
	 * case the worker thread is NOT started.
	 */
	@PostConstruct
	public void start() {

		queue = new LinkedList<Message>();
		stopWords = new LinkedList<String>();
		specialWords = new LinkedList<String>();

		//
		// load the dispatcher
		this.initDispatcher();

		try {
			// NOTE(review): database credentials are hard-coded here
			this.setConnectionParams("ebas", "gwtebas", "bachelor_project");
			this.loadStopWords(this.getConnection());

			//
			// init the spell checker
			//
			this.initSpellCheckDictionary();

			this.loadSpecialWords();

			//
			// start the thread
			//
			new Thread(this).start();

		} catch (Exception e) {
			e.printStackTrace();
		}

	}

	/**
	 * Read every stop word from the Word table into {@link #stopWords}.
	 * Statement and result set are closed even when the query fails.
	 * 
	 * @param connection
	 *            connection to run the query on; it is not closed here
	 * @throws java.sql.SQLException
	 *             if the query or the iteration fails
	 */
	private void loadStopWords(Connection connection)
			throws java.sql.SQLException {

		String query = "select content from Word where labels like 'stop'";
		Statement statement = connection.createStatement();
		try {
			ResultSet rows = statement.executeQuery(query);
			try {
				while (rows.next()) {
					stopWords.add(rows.getString("content"));
				}
			} finally {
				rows.close();
			}
		} finally {
			statement.close();
		}
	}

	/**
	 * Read the special words (one per line) from
	 * {@link #STR_SPECIALWORD_LOCATION} into {@link #specialWords}. The
	 * reader is closed even when reading fails.
	 * 
	 * @throws java.io.IOException
	 *             if the file cannot be opened or read
	 */
	private void loadSpecialWords() throws java.io.IOException {

		BufferedReader in = new BufferedReader(new FileReader(new File(
				STR_SPECIALWORD_LOCATION)));
		try {
			String line = null;
			while ((line = in.readLine()) != null) {
				String specialWord = line.trim();

				// A blank line would be stored as "" and, because
				// isSpecialWord() tests with String.contains, would then
				// match every word longer than two characters.
				if (specialWord.length() > 0) {
					specialWords.add(specialWord);
				}
			}
		} finally {
			in.close();
		}
	}

	/**
	 * Lifecycle callback: asks the worker loop to end and wakes the worker
	 * in case it is currently waiting for a message.
	 */
	@PreDestroy
	public void stop() {

		// clear the loop flag first so the woken worker sees it ...
		bRunning = false;

		// ... then signal the monitor the worker waits on
		synchronized (this) {
			notify();
		}
	}

	/**
	 * Worker loop. Takes messages from the head of the queue one at a time,
	 * processes each with {@link #processMessage(Message)} and hands the
	 * result to the dispatcher. When the queue is empty the thread waits on
	 * this object's monitor until {@link #addRawMessage(Message)} or
	 * {@link #stop()} notifies it.
	 * 
	 * The loop ends when {@code bRunning} is cleared or when the thread is
	 * interrupted; messages still queued at that point are not processed.
	 */
	public void run() {

		while (bRunning) {

			//
			// take the first message in queue, if there is one
			//
			Message msg = null;
			synchronized (queue) {
				if (!queue.isEmpty()) {
					msg = queue.remove(0);
				}
			}

			if (msg == null) {

				//
				// wait for a message, sleep until then
				//
				synchronized (this) {

					// Re-check under the monitor that the producers notify
					// on: a message added (or a stop requested) after the
					// poll above but before this point would otherwise be
					// signalled to nobody and the worker would sleep on.
					if (bRunning && this.getQueueLength() == 0) {
						try {
							this.wait();
						} catch (InterruptedException e) {
							// treat interruption as a request to end the
							// worker and keep the interrupt status visible
							Thread.currentThread().interrupt();
							return;
						}
					}
				}

				// go back to the loop condition before polling again
				continue;
			}

			//
			// Process the message
			//
			System.out.println("New message for processing:");
			System.out.println("\t> url       : " + msg.getUrl());
			MessageWithProperties processedMessageWithProperties = this
					.processMessage(msg);

			//
			// let the dispatcher handle the processed message
			if (dispatcher != null) {
				dispatcher.dispatchMessage(processedMessageWithProperties);
			} else {
				System.out.println("dispatcher is null !!!");
			}
		}
	}

	/**
	 * Split raw text into tokens. The text is lower-cased first, then passed
	 * through {@link #replaceHtmlMarkers(String)}, and finally split on
	 * {@link #WORD_INPUT_DELIMITERS}.
	 * 
	 * @param input
	 *            raw message text
	 * @return the tokens, in input order; the array may contain empty strings
	 *         where two delimiters are adjacent
	 */
	private String[] tokenize(String input) {

		String normalized = this.replaceHtmlMarkers(input.toLowerCase());

		return normalized.split(WORD_INPUT_DELIMITERS);
	}

	/**
	 * Replace the Html markers from input string
	 * 
	 * @param input
	 * @return
	 */
	private String replaceHtmlMarkers(String input) {

		input = input.replaceAll("<", "<");
		input = input.replaceAll(">", ">");
		input = input.replaceAll(""", "\"");
		input = input.replaceAll("'", "'");
		input = input.replaceAll("", " ");
		input = input.replaceAll(" ", " ");

		return input;
	}

	/**
	 * Reduce a word to its stem by delegating to {@link Stemmer#stem}.
	 * 
	 * @param input
	 *            the word to stem
	 * @return whatever the stemmer returns for the word
	 */
	private String stem(String input) {

		String root = Stemmer.stem(input);
		return root;
	}

	/**
	 * Tell whether a word is in the stop-word list. The comparison is done
	 * on the lower-cased word; the word is not stemmed before the lookup.
	 * 
	 * @param input
	 *            raw input (unstemmed)
	 * @return {@code true} if the lower-cased word is contained in the
	 *         stop-word list
	 */
	private boolean isStopWord(String input) {

		String lowered = input.toLowerCase();

		return stopWords.contains(lowered);
	}

	/**
	 * Tell whether a word is built around one of the special words: it must
	 * contain the special word and be more than two characters longer than
	 * it.
	 * 
	 * @param input
	 *            the word to test; lower-cased before testing
	 * @return {@code true} if some special word is contained in the input and
	 *         the input is more than two characters longer than that word
	 * @throws IllegalArgumentException
	 *             if the lower-cased input contains anything but the letters
	 *             a-z
	 */
	private boolean isSpecialWord(String input) throws IllegalArgumentException {

		//
		// first lowercase
		input = input.toLowerCase();

		if (!input.matches("[a-z]*"))
			throw new IllegalArgumentException("Invalid input string: " + input);

		for (String specialWord : specialWords) {

			// An empty entry (e.g. from a blank line in the word file) is
			// contained in every string and would flag every input longer
			// than two characters, so it is ignored.
			if (specialWord.length() == 0)
				continue;

			if (input.contains(specialWord)
					&& input.length() > (specialWord.length() + 2))
				return true;
		}

		return false;
	}

	/**
	 * Build the formatted-content string from the tokens of a message. Every
	 * token contributes one entry terminated by {@link #WORD_DB_DELIMITER}:
	 * <ul>
	 * <li>a stop word is written as <code>word-stop</code>;</li>
	 * <li>a special word is written as <code>word-special</code>;</li>
	 * <li>a correctly spelled word is written unchanged;</li>
	 * <li>an empty token, or a token containing anything but the letters
	 * a-z, is written as an empty entry (delimiter only).</li>
	 * </ul>
	 * A misspelled word currently contributes nothing at all, not even a
	 * delimiter (handling is still to be decided).
	 * 
	 * Tokens are classified in a single pass in their original order. The
	 * previous two-pass version compared positions of the filtered word list
	 * with positions of the unfiltered token array, which dropped valid words
	 * that followed an invalid token.
	 * 
	 * @param words
	 *            the tokens of the message, in order
	 * @return the formatted-content string
	 */
	private String accumulateTokens(String[] words) {

		StringBuilder formatted = new StringBuilder();

		for (String token : words) {

			String w = token.trim();

			//
			// empty or non-alphabetic token: keep its slot, but leave it empty
			//
			if (w.length() == 0 || !w.matches("[a-z]*")) {
				formatted.append(WORD_DB_DELIMITER);
				continue;
			}

			if (this.isStopWord(w)) {
				formatted.append(w).append("-stop").append(WORD_DB_DELIMITER);
			} else if (this.isSpecialWord(w)) {
				formatted.append(w).append("-special").append(
						WORD_DB_DELIMITER);
			} else {

				// spellCheck returns the word itself when it is spelled
				// correctly
				String suggestions = this.spellCheck(w);
				if (suggestions.equals(w)) {
					formatted.append(w).append(WORD_DB_DELIMITER);
				}

				//
				// otherwise the word is misspelled
				//
				// TODO what's next?
			}
		}

		return formatted.toString();
	}

	/**
	 * Count how often each plain word occurs in a formatted-content string.
	 * The string is split on {@link #WORD_DB_DELIMITER}; empty entries and
	 * entries containing a '-' (the tagged 'stop' and 'special' words) are
	 * not counted.
	 * 
	 * @param input
	 *            formatted content of a message
	 * @return a map from word to its number of occurrences
	 */
	private Map<String, Integer> getWordFrequency(String input) {

		Map<String, Integer> counts = new HashMap<String, Integer>();

		for (String entry : input.split(WORD_DB_DELIMITER)) {

			// skip empty slots and tagged (stop / special) words
			if (entry.length() == 0 || entry.contains("-")) {
				continue;
			}

			Integer seen = counts.get(entry);
			counts.put(entry, seen == null ? 1 : seen + 1);
		}

		return counts;
	}

	/**
	 * Derive the properties of a message from its formatted content and
	 * store them on the message:
	 * <ul>
	 * <li>empty formatted content: property "type" = "bad-message", nothing
	 * else is set;</li>
	 * <li>"frequencies": word:count pairs, each followed by the delimiter;</li>
	 * <li>"num-words": the number of distinct counted words;</li>
	 * <li>"stop-words" / "special-words": the tagged words without their tag,
	 * each followed by the delimiter; set only when at least one exists.</li>
	 * </ul>
	 * 
	 * @param message
	 *            message whose formatted content has already been set
	 * @return the same message instance, with the properties added
	 */
	private MessageWithProperties calculateMessageProperties(
			MessageWithProperties message) {

		if (message.getFormattedContent().equals("")) {
			message.setProperty("type", "bad-message");
			return message;
		}

		//
		// frequency string and number of distinct words
		//
		Map<String, Integer> frequencies = this.getWordFrequency(message
				.getFormattedContent());

		StringBuilder frequencyText = new StringBuilder();
		for (Map.Entry<String, Integer> entry : frequencies.entrySet()) {
			frequencyText.append(entry.getKey()).append(WORD_FREQ_DELIMITER)
					.append(entry.getValue()).append(WORD_DB_DELIMITER);
		}
		message.setProperty("frequencies", frequencyText.toString());
		message.setProperty("num-words", String.valueOf(frequencies.size()));

		//
		// collect stop words and special words, stripped of their tag
		//
		StringBuilder stopText = new StringBuilder();
		StringBuilder specialText = new StringBuilder();

		String[] entries = message.getFormattedContent().split(
				WORD_DB_DELIMITER);
		for (String entry : entries) {

			if (entry.indexOf("-stop") != -1) {
				stopText.append(entry.substring(0, entry.indexOf('-')))
						.append(WORD_DB_DELIMITER);
			}

			if (entry.indexOf("-special") != -1) {
				specialText.append(entry.substring(0, entry.indexOf('-')))
						.append(WORD_DB_DELIMITER);
			}
		}

		if (stopText.length() > 0) {
			message.setProperty("stop-words", stopText.toString());
		}
		if (specialText.length() > 0) {
			message.setProperty("special-words", specialText.toString());
		}

		return message;
	}

	/**
	 * Turn a raw message into a message with formatted content and
	 * properties. Content that is null, or not longer than
	 * {@link #MIN_MESSAGE_LENGTH} characters, is rejected: a diagnostic line
	 * is printed and both content and formatted content are set to the empty
	 * string.
	 * 
	 * NOTE(review): rejected messages are returned without going through
	 * calculateMessageProperties, so they do not receive the "bad-message"
	 * type - confirm this is intended.
	 * 
	 * @param message
	 *            the raw message
	 * @return a new MessageWithProperties wrapping the message
	 */
	public MessageWithProperties processMessage(Message message) {

		MessageWithProperties processedMessage = new MessageWithProperties(
				message);

		//
		// decide whether the message is usable at all
		//
		String rejection = null;
		if (processedMessage.getContent() == null) {
			rejection = "\t> invalid message (message's content is null)";
		} else if (processedMessage.getContent().length() <= MIN_MESSAGE_LENGTH) {
			rejection = "\t> invalid message (message's content is too short)";
		}

		if (rejection != null) {
			System.out.println(rejection);
			processedMessage.setContent("");
			processedMessage.setFormattedContent("");
			return processedMessage;
		}

		//
		// tokenize, format, then calculate the properties
		//
		String[] words = this.tokenize(processedMessage.getContent());
		processedMessage.setFormattedContent(this.accumulateTokens(words));

		return this.calculateMessageProperties(processedMessage);
	}

	/**
	 * Resolve the message dispatcher through JNDI under the name
	 * "MessageDispatcher", unless it has been resolved already. A failed
	 * lookup is printed and leaves the dispatcher null.
	 */
	private void initDispatcher() {

		if (dispatcher != null) {
			return;
		}

		try {
			Object bound = new InitialContext().lookup("MessageDispatcher");
			dispatcher = (MessageDispatcherMBean) bound;
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Create the spell checker from the word list at
	 * {@link #STR_SPELLCHECKER_LOCATION}. A failure is printed and leaves the
	 * spell checker unset.
	 */
	private void initSpellCheckDictionary() {

		try {
			File wordList = new File(STR_SPELLCHECKER_LOCATION);
			SpellDictionaryHashMap wordMap = new SpellDictionaryHashMap(
					wordList);
			spellChecker = new SpellChecker(wordMap);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Spell-check a single word.
	 * 
	 * @param input
	 *            one word, without spaces
	 * @return the word itself when the spell checker accepts it; otherwise
	 *         at most {@link #MAX_SUGGESTIONS} suggestions joined with ","
	 *         (an empty string when there are none)
	 * @throws IllegalArgumentException
	 *             if the input contains a space
	 */
	private String spellCheck(String input) throws IllegalArgumentException {

		if (input.contains(" "))
			throw new IllegalArgumentException(
					"spellCheck can only verify single words");

		if (spellChecker.isCorrect(input))
			return input;

		// NOTE(review): the second argument is passed through unchanged from
		// the previous code; confirm its meaning against the spell-checker
		// API (it may be a cost threshold rather than a result count).
		List<?> suggestions = spellChecker.getSuggestions(input, 10);

		//
		// Join the first MAX_SUGGESTIONS entries. The separator is written
		// between entries only, so there is no trailing comma for short lists
		// and no run-together words for long ones.
		//
		StringBuilder joined = new StringBuilder();
		int taken = 0;
		for (Object suggestion : suggestions) {

			if (taken == MAX_SUGGESTIONS)
				break;

			if (taken > 0)
				joined.append(",");

			joined.append(suggestion.toString());
			taken++;
		}

		return joined.toString();
	}
}