/*
* Copyright (c) 2012 GigaSpaces Technologies Ltd. All rights reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.openspaces.bigdata.processor;
import static com.google.common.collect.Maps.newHashMap;
import static com.google.common.collect.Sets.newHashSet;
import java.util.Collections;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.annotation.PostConstruct;
import javax.annotation.Resource;
import org.openspaces.bigdata.processor.events.TokenizedTweet;
import org.openspaces.core.GigaSpace;
import org.openspaces.events.EventDriven;
import org.openspaces.events.EventTemplate;
import org.openspaces.events.TransactionalEvent;
import org.openspaces.events.adapter.SpaceDataEvent;
import org.openspaces.events.polling.Polling;
import org.openspaces.events.polling.ReceiveHandler;
import org.openspaces.events.polling.receive.MultiTakeReceiveOperationHandler;
import org.openspaces.events.polling.receive.ReceiveOperationHandler;
/**
 * Event polling container processor that filters out non-informative tokens, such as prepositions, from non-filtered
 * {@link TokenizedTweet} instances.
 * <p>
 * The container polls for tweets matching {@link #tokenizedNonFilteredTweet()}, hands each one to
 * {@link #eventListener(TokenizedTweet)}, and the returned tweet is marked as filtered so it no longer matches the template.
 *
 * @author Dotan Horovits
 */
@EventDriven
@Polling(gigaSpace = "gigaSpace", concurrentConsumers = 2, maxConcurrentConsumers = 2, receiveTimeout = 5000)
@TransactionalEvent
public class TokenFilter {

    private static final Logger log = Logger.getLogger(TokenFilter.class.getName());

    /** Maximum number of entries requested by the multi-take receive handler in a single operation. */
    private static final int BATCH_SIZE = 100;

    /**
     * Tokens considered non-informative and removed from every tweet. Wrapped as unmodifiable because the set is shared by all
     * consumer threads and must never change after class initialization. Matching is exact (case-sensitive).
     */
    private static final Set<String> FILTER_TOKENS = Collections.unmodifiableSet(newHashSet("aboard", "about", "above",
            "across", "after", "against", "along", "amid", "among", "anti", "around", "as", "at", "before", "behind", "below",
            "beneath", "beside", "besides", "between", "beyond", "but", "by", "concerning", "considering", "despite", "down",
            "during", "except", "excepting", "excluding", "following", "for", "from", "in", "inside", "into", "like", "minus",
            "near", "of", "off", "on", "onto", "opposite", "outside", "over", "past", "per", "plus", "regarding", "round", "save",
            "since", "than", "through", "to", "toward", "under", "underneath", "unlike", "until", "up", "upon", "versus", "via",
            "with", "within", "without"));

    @Resource(name = "gigaSpace")
    GigaSpace gigaSpace;

    /** Logs that this processor instance has been initialized. */
    @PostConstruct
    void postConstruct() {
        log.info(this.getClass().getName() + " initialized");
    }

    /**
     * Builds the receive operation handler used by the polling container: a multi-take handler limited to
     * {@link #BATCH_SIZE} entries, configured as non-blocking with a non-blocking factor of 1.
     *
     * @return the configured receive operation handler
     */
    @ReceiveHandler
    ReceiveOperationHandler receiveHandler() {
        MultiTakeReceiveOperationHandler receiveHandler = new MultiTakeReceiveOperationHandler();
        receiveHandler.setMaxEntries(BATCH_SIZE);
        receiveHandler.setNonBlocking(true);
        receiveHandler.setNonBlockingFactor(1);
        return receiveHandler;
    }

    /**
     * This method returns the template of a non-filtered {@link TokenizedTweet}.
     *
     * @return template for the event container
     */
    @EventTemplate
    TokenizedTweet tokenizedNonFilteredTweet() {
        TokenizedTweet template = new TokenizedTweet();
        template.setFiltered(false);
        return template;
    }

    /**
     * Event handler that receives a {@link TokenizedTweet} and filters out non-informative tokens. Filtering is performed using
     * {@link #isTokenRequireFilter(String)}.
     * <p>
     * The tweet's token map is replaced by a filtered copy (the map instance originally held by the tweet is not mutated) and
     * the tweet is flagged as filtered. A tweet without a token map is treated as having no tokens.
     *
     * @param tokenizedTweet
     *            the non-filtered tweet to process
     * @return the input tokenizedTweet after modifications
     */
    @SpaceDataEvent
    public TokenizedTweet eventListener(TokenizedTweet tokenizedTweet) {
        log.info("filtering tweet " + tokenizedTweet.getId());

        // Work on a copy. A null map is treated as empty: throwing here would fail the event, and since this
        // listener is transactional the same tweet would presumably be rolled back and redelivered (TODO confirm).
        final Map<String, Integer> originalTokens = tokenizedTweet.getTokenMap();
        final Map<String, Integer> tokenMap;
        if (originalTokens == null) {
            tokenMap = newHashMap();
        } else {
            tokenMap = newHashMap(originalTokens);
        }

        final int numTokensBefore = tokenMap.size();
        // Only the key decides whether an entry is dropped; removing through the key-set iterator removes the mapping.
        for (Iterator<String> it = tokenMap.keySet().iterator(); it.hasNext();) {
            if (isTokenRequireFilter(it.next())) {
                it.remove();
            }
        }
        final int numTokensAfter = tokenMap.size();

        tokenizedTweet.setTokenMap(tokenMap);
        tokenizedTweet.setFiltered(true);

        // Guarded so the message is not built for every event when FINE logging is disabled.
        if (log.isLoggable(Level.FINE)) {
            log.fine("filtered out " + (numTokensBefore - numTokensAfter) + " tokens from tweet " + tokenizedTweet.getId());
        }
        return tokenizedTweet;
    }

    /**
     * Tells whether the given token is non-informative and should be removed.
     *
     * @param token
     *            the token to check
     * @return {@code true} if the token appears in {@link #FILTER_TOKENS}
     */
    private boolean isTokenRequireFilter(final String token) {
        return FILTER_TOKENS.contains(token);
    }
}