/*
* Copyright (c) 2012 GigaSpaces Technologies Ltd. All rights reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.openspaces.bigdata.processor;
import static com.google.common.collect.Maps.newHashMap;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.logging.Logger;
import javax.annotation.Resource;
import org.openspaces.bigdata.processor.events.TokenizedTweet;
import org.openspaces.core.GigaSpace;
import org.openspaces.events.EventDriven;
import org.openspaces.events.EventTemplate;
import org.openspaces.events.TransactionalEvent;
import org.openspaces.events.adapter.SpaceDataEvent;
import org.openspaces.events.polling.Polling;
import com.gigaspaces.document.SpaceDocument;
import com.j_spaces.core.client.SQLQuery;
/**
* This polling container processor parses raw tweets, generating TokenizedTweets.
* <p>
* A raw tweet is represented by an instance of SpaceDocument of type "Tweet"
*
* @author Dotan Horovits
*/
@EventDriven
@Polling(gigaSpace = "gigaSpace", concurrentConsumers = 2, maxConcurrentConsumers = 2, receiveTimeout = 60)
@TransactionalEvent(timeout = 100)
public class TweetParser {

    private static final Logger log = Logger.getLogger(TweetParser.class.getName());

    /** Tokens shorter than this many characters are discarded during tokenization. */
    private static final int MIN_TOKEN_LENGTH = 3;

    /** Delimiter characters used to split tweet text into tokens (punctuation + whitespace). */
    private static final String TOKEN_DELIMITERS = "\"{}[]:;|<>?`'.,/~!@#$%^&*()_-+= \t\n\r\f\\";

    // NOTE(review): injected but never referenced in this class — confirm it is still needed.
    @Resource(name = "clusteredGigaSpace")
    GigaSpace clusteredGigaSpace;

    /** Embedded space proxy used to write {@link TokenizedTweet} results. */
    @Resource(name = "gigaSpace")
    GigaSpace gigaSpace;

    /**
     * This method returns a SQL query defining an unprocessed Tweet.
     *
     * @return {@link SQLQuery} of a {@link SpaceDocument} of type "Tweet" whose
     *         "Processed" property is false
     */
    @EventTemplate
    SQLQuery<SpaceDocument> unprocessedTweet() {
        // Use the literal directly instead of concatenating the boolean into the string at runtime.
        return new SQLQuery<SpaceDocument>("Tweet", "Processed = false");
    }

    /**
     * Event handler that receives a Tweet instance, processes its text and generates a listing of
     * the tokens appearing in the text and their respective count of appearance in the text,
     * instantiates an instance of {@link TokenizedTweet} with this data, and writes it to the space.
     *
     * @param tweet a {@link SpaceDocument} of type "Tweet" with "Id" and "Text" properties
     * @return the same tweet document with its "Processed" property set to {@code true}
     */
    @SpaceDataEvent
    public SpaceDocument eventListener(SpaceDocument tweet) {
        log.info("parsing tweet " + tweet);
        Long id = (Long) tweet.getProperty("Id");
        String text = tweet.getProperty("Text");
        if (text != null) {
            gigaSpace.write(new TokenizedTweet(id, tokenize(text)));
        }
        // Tweets with no text are still marked processed so the polling container never re-polls them.
        tweet.setProperty("Processed", true);
        return tweet;
    }

    /**
     * Splits the given text on {@link #TOKEN_DELIMITERS} and counts occurrences of each token,
     * skipping tokens shorter than {@link #MIN_TOKEN_LENGTH}.
     *
     * @param text raw tweet text; must not be {@code null}
     * @return mapping of {token -> occurrence count}
     */
    protected Map<String, Integer> tokenize(String text) {
        Map<String, Integer> tokenMap = newHashMap();
        StringTokenizer tokenizer = new StringTokenizer(text, TOKEN_DELIMITERS);
        while (tokenizer.hasMoreTokens()) {
            String token = tokenizer.nextToken();
            if (token.length() >= MIN_TOKEN_LENGTH) {
                // Single get() instead of containsKey() + get(): one map lookup rather than two.
                Integer count = tokenMap.get(token);
                tokenMap.put(token, count == null ? 1 : count + 1);
            }
        }
        return tokenMap;
    }
}