/* * Copyright 2009 VoidSearch.com * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package com.voidsearch.voidbase.quant.feed; import com.voidsearch.voidbase.apps.queuetree.client.QueueTreeClient; import com.voidsearch.voidbase.client.SimpleHttpClient; import com.voidsearch.voidbase.quant.timeseries.NumericalSequence; import com.voidsearch.voidbase.quant.timeseries.SequenceGenerator; import org.apache.commons.lang.StringUtils; import java.util.*; /** * tracks token frequency on underlying stream data */ public class TokenFrequency extends NumericalSequence implements SequenceGenerator { private int MAX_TOKENS_LIMIT = 20; private SimpleHttpClient client; private String url; private HashMap<String, Integer> termFreq; private LinkedList<TokenEntry> topTerms; private HashMap<String, Integer> currentFreq; private QueueTreeClient queueClient = new QueueTreeClient(); String tokenDelimiter = "\t"; public class TokenEntry implements Comparable { String token; Integer count = 0; public TokenEntry(String token, int count) { this.token = token; this.count = count; } public void increment() { count++; } public String getToken() { return token; } public Integer getCount() { return count; } public boolean equals(Object o1) { return token.equals(((TokenEntry) o1).getToken()); } public int compareTo(Object o1) { return count.compareTo(((TokenEntry) o1).getCount()); } public String toString() { return token + "\t" + count; } public int hashCode() { return token.hashCode(); } } public TokenFrequency(String requestURL) { init(requestURL); } public TokenFrequency(String requestURL, String tokenDelimiter) { this.tokenDelimiter = tokenDelimiter; init(requestURL); } private void init(String requestURL) { try { client = new SimpleHttpClient(); url = requestURL; termFreq = new HashMap<String, Integer>(); topTerms = new LinkedList<TokenEntry>(); } catch (Exception e) { e.printStackTrace(); } } /** * generate next sequence element * * @return */ public double next() { try { dumpTopTerms(); byte[] result = client.get(url); StringTokenizer tokenizer = new StringTokenizer(new String(result)); int tokenCount = 0; currentFreq = new HashMap<String, Integer>(); while (tokenizer.hasMoreTokens()) { String token = tokenizer.nextToken(); if (isValidToken(token)) { int count = 1; if (termFreq.containsKey(token)) { count = (Integer) termFreq.get(token) + 1; } termFreq.put(token, count); topTerms.add(new TokenEntry(token, count)); tokenCount++; // update local if (currentFreq.containsKey(token)) { currentFreq.put(token, currentFreq.get(token) + 1); } else { currentFreq.put(token, 1); } } } return tokenCount; } catch (Exception e) { e.printStackTrace(); } return 0; } /** * cleanup frequency tree * invoked periodically on next() call */ private void cleanupTree() { } private void dumpTopTerms() throws Exception { int termCnt = 0; Collections.sort(topTerms); HashSet<String> displaySet = new HashSet<String>(); for (int i = topTerms.size() - 1; i >= 0; i--) { TokenEntry entry = topTerms.get(i); if (!displaySet.contains(entry.getToken())) { if (termCnt++ <= MAX_TOKENS_LIMIT) { System.out.println(entry + "\t" + currentFreq.get(entry.getToken())); // handle queue entry Integer freq = 0; if (currentFreq.containsKey(entry.getToken())) { freq = currentFreq.get(entry.getToken()); } queueClient.create(entry.getToken(), 1000); queueClient.put(entry.getToken(), freq.toString()); displaySet.add(entry.getToken()); } else { // cleanup non-active queues queueClient.delete(entry.getToken()); } } } } /** * check whether token is valid * * @param token * @return */ private boolean isValidToken(String token) { // filter empty strings //if (token.isEmpty()) { if ((token == null) || (token.length() == 0)) { return false; } // filter html tags if (token.contains(">") || token.contains("<")) { return false; } // filter min lenght if (token.length() < 5) { return false; } if (!StringUtils.isAlpha(token)) { return false; } return true; } }