/** * Copyright (C) 2010 Peter Karich <jetwick_@_pannous_._info> * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.jetwick.tw; import de.jetwick.data.JTag; import de.jetwick.data.JTweet; import de.jetwick.es.ElasticUserSearch; import de.jetwick.es.ElasticTagSearch; import de.jetwick.util.Helper; import java.util.Collection; import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.Map; import java.util.Set; import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import twitter4j.TwitterStream; /** * fills the tweets queue via twitter searchAndGetUsers (does not cost API calls) * * @author Peter Karich, peat_hal 'at' users 'dot' sourceforge 'dot' net */ public class TweetProducerViaStream extends MyThread implements TweetProducer { private final Logger logger = LoggerFactory.getLogger(getClass()); protected BlockingQueue<JTweet> resultTweets = new LinkedBlockingQueue<JTweet>(); protected TwitterSearch twSearch; protected ElasticTagSearch tagSearch; private long newStreamInterval = 3 * 60 * 1000; private double tweetsPerSecLimit = 0.5; public TweetProducerViaStream() { super("tweet-producer-stream"); } @Override public void setQueue(BlockingQueue<JTweet> packages) { this.resultTweets = packages; } @Override public void run() { TwitterStream stream = null; TwitterStream oldStream = null; // we cannot detect frequency of all terms but detect + remove high frequent disturbers Map<String, Integer> termFreq = new LinkedHashMap<String, Integer>(); while (true) { try { // stream only LESS FREQUENT tags! leave popular tags only for search Collection<String> input = initTags(termFreq); termFreq.clear(); if (input.isEmpty()) { logger.error("No less frequent tags found! Frequency limit:" + tweetsPerSecLimit); if (!myWait(10)) break; continue; } int counter = 0; logger.info("Starting over with " + input.size() + " tags. indexed tweets:" + counter + " tweetsPerSecLimit:" + tweetsPerSecLimit + " " + input); if (stream != null) oldStream = stream; // use a separate collection here to let the listener release when doing garbage collection // (the listener which is added in the streamingTwitter method) BlockingQueue<JTweet> queue = new LinkedBlockingQueue<JTweet>(1000); stream = twSearch.streamingTwitter(input, queue); // shutdown old stream if (oldStream != null) { oldStream.shutdown(); // oldStream.cleanUp(); } long start = System.currentTimeMillis(); while (true) { JTweet tw = queue.take(); String matchingTerm = null; String txt = tw.getLowerCaseText(); for (String term : input) { if (txt.contains(term)) { matchingTerm = term; break; } } resultTweets.put(tw.setFeedSource("from stream:" + matchingTerm)); Integer integ = termFreq.put(matchingTerm, 1); if (integ != null) termFreq.put(matchingTerm, integ + 1); counter++; // UPDATE tags after a while if ((System.currentTimeMillis() - start) > newStreamInterval) break; } } catch (Exception ex) { logger.error("!! Error while getting tweets via streaming API. Waiting and trying again.", ex); if (!myWait(60 * 5)) break; } } logger.info(getName() + " finished"); } @Override public void setTwitterSearch(TwitterSearch tws) { this.twSearch = tws; } public Collection<String> initTags(Map<String, Integer> termFreq) { Map<String, JTag> tags = new LinkedHashMap<String, JTag>(); try { for (JTag tag : tagSearch.findLowFrequent(0, 1000, tweetsPerSecLimit)) { if (tag.getTerm() != null) { // information in index is based on old search data check if 'realtime' tweetsPerSec is also ok Integer counts = termFreq.get(tag.getTerm()); if (counts != null && counts / (newStreamInterval / 1000f) > tweetsPerSecLimit) { logger.info("Detected tag with a too high frequency (based on stream data):" + tag + " stream-counts:" + counts); continue; } int spaces = Helper.countChars(tag.getTerm(), ' '); if (spaces > 7) { logger.info("Skipping term " + tag.getTerm() + " because too many spaces:" + spaces); continue; } if (tag.getTerm().contains(" OR ")) { logger.warn("Hmmh somewhere the OR came into the tag index!?"); continue; } tags.put(tag.getTerm(), tag); } } } catch (Exception ex) { logger.info("Couldn't query tag index", ex); } // TODO further remove overlapping tags like 'wicket' and 'apache wicket' Set<String> input = new LinkedHashSet<String>(); int MAX_TAGS = 400; MAIN: for (JTag t : tags.values()) { String term = t.getTerm(); if (input.size() >= MAX_TAGS) { logger.error("Too many Tags - Cannot further add tags!" + input.size()); break MAIN; } // filter by links only does NOT work!! input.add(term.trim());// + " " + TwitterSearch.LINK_FILTER); } return input; } @Override public void setTagSearch(ElasticTagSearch tagSearch) { this.tagSearch = tagSearch; } @Override public void setUserSearch(ElasticUserSearch userSearch) { throw new UnsupportedOperationException("Not supported yet."); } public void setNewStreamInterval(long millis) { newStreamInterval = millis; } public void setTweetsPerSecLimit(double tweetsPerSecLimit) { this.tweetsPerSecLimit = tweetsPerSecLimit; } }