/** * Copyright (C) 2010 Peter Karich <jetwick_@_pannous_._info> * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.jetwick.util; import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; import com.google.inject.Module; import com.google.inject.Provider; import de.jetwick.config.Configuration; import de.jetwick.config.DefaultModule; import de.jetwick.data.DbObject; import de.jetwick.es.ElasticTweetSearch; import de.jetwick.es.ElasticUserSearch; import de.jetwick.rmi.RMIClient; import de.jetwick.es.JetwickQuery; import de.jetwick.data.JUser; import de.jetwick.es.AbstractElasticSearch; import de.jetwick.es.CreateObjectsInterface; import de.jetwick.es.TweetQuery; import de.jetwick.tw.Credits; import de.jetwick.tw.MyTweetGrabber; import de.jetwick.tw.TwitterSearch; import de.jetwick.tw.queue.QueueThread; import java.util.Arrays; import java.util.Date; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeSet; import org.elasticsearch.action.search.SearchResponse; import org.elasticsearch.search.facet.terms.TermsFacet; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * * @author Peter Karich, peat_hal 'at' users 'dot' sourceforge 'dot' net */ public class Util { private static Logger logger = LoggerFactory.getLogger(Util.class); @Inject private ElasticUserSearch userSearch; @Inject private ElasticTweetSearch tweetSearch; private int userCounter; private Configuration config = new Configuration(); public static void main(String[] args) { Map<String, String> map = Helper.parseArguments(args); Util util = new Util(); String argStr = ""; if (!Helper.isEmpty(map.get("deleteAll"))) { util.deleteAll(); return; } argStr = map.get("fillFrom"); if (!Helper.isEmpty(argStr)) { String fromUrl = argStr; util.fillFrom(fromUrl); return; } argStr = map.get("clearUserTokens"); if (!Helper.isEmpty(argStr)) { String newUserIndexName = argStr; util.clearUserTokens(newUserIndexName); return; } if (!Helper.isEmpty(map.get("copyStaticTweets"))) { util.copyStaticTweets(); return; } argStr = map.get("showFollowers"); if (!Helper.isEmpty(argStr)) { String user = argStr; util.showFollowers(user); return; } if (!Helper.isEmpty(map.get("optimize"))) { util.optimize(); return; } int hitsPerPage = 10000; try { hitsPerPage = Integer.parseInt(map.get("hitsPerPage")); } catch (Exception ex) { } // copyUserIndex=newtwindex argStr = map.get("copyUserIndex"); if (!Helper.isEmpty(argStr)) { String newIndex = argStr; util.copyUserIndex(newIndex, hitsPerPage); return; } // copyUserIndex=newtwindex argStr = map.get("copyTweetIndex"); if (!Helper.isEmpty(argStr)) { String newIndex = argStr; util.copyTweetIndex(newIndex, hitsPerPage); return; } argStr = map.get("removeIndexAndAddAlias"); if (!Helper.isEmpty(argStr)) { logger.warn("use the simple curl script"); return; } argStr = map.get("clearFriendsUpdate"); if (!Helper.isEmpty(argStr)) { String user = argStr; util.clearFriendsUpdate(user); return; } } public Util() { Module module = new DefaultModule(); Guice.createInjector(module).injectMembers(this); } public void deleteAll() { // why don't we need to set? query.setQueryType("simple") userSearch.deleteAll(); userSearch.refresh(); logger.info("Successfully finished deleteAll"); } private void copyStaticTweets() { Module module = new DefaultModule(); Injector injector = Guice.createInjector(module); Provider<RMIClient> rmiProvider = injector.getProvider(RMIClient.class); Configuration cfg = injector.getInstance(Configuration.class); TwitterSearch twSearch = injector.getInstance(TwitterSearch.class); twSearch.initTwitter4JInstance(cfg.getTwitterSearchCredits().getToken(), cfg.getTwitterSearchCredits().getTokenSecret(), true); ElasticTweetSearch fromUserSearch = new ElasticTweetSearch(injector.getInstance(Configuration.class)); JetwickQuery query = new TweetQuery().addFilterQuery(ElasticTweetSearch.UPDATE_DT, "[* TO *]"); // TODO set facetlimit to 2000 query.addFacetField("user").setSize(0); SearchResponse rsp = fromUserSearch.query(query); TermsFacet tf = (TermsFacet) rsp.getFacets().facet("user"); logger.info("found: " + tf.entries().size() + " users with the specified criteria"); int SLEEP = 30; int counter = 0; for (TermsFacet.Entry tmpUser : tf.entries()) { if (tmpUser.getCount() < 20) break; while (twSearch.getRateLimit() <= 3) { try { logger.info("sleeping " + SLEEP + " seconds to avoid ratelimit violation"); Thread.sleep(1000 * SLEEP); } catch (InterruptedException ex) { throw new IllegalStateException(ex); } } logger.info(counter++ + "> feed pipe from " + tmpUser.getTerm() + " with " + tmpUser.getCount() + " tweets"); MaxBoundSet boundSet = new MaxBoundSet<String>(0, 0); // try updating can fail so try max 3 times for (int trial = 0; trial < 3; trial++) { MyTweetGrabber grabber = new MyTweetGrabber().setMyBoundSet(boundSet). init(null, null, tmpUser.getTerm()).setTweetsCount((int) tmpUser.getCount()). setRmiClient(rmiProvider).setTwitterSearch(twSearch); QueueThread pkg = grabber.queueTweetPackage(); Thread t = new Thread(pkg); t.start(); try { t.join(); if (pkg.getException() == null) break; logger.warn(trial + "> Try again feeding of user " + tmpUser.getTerm() + " for tweet package " + pkg); } catch (InterruptedException ex) { logger.warn("interrupted", ex); break; } } } // TODO send via RMI } public void fillFrom(final String fromUrl) { ElasticTweetSearch fromTweetSearch = new ElasticTweetSearch(fromUrl); JetwickQuery query = new TweetQuery(); long maxPage = 1; int hitsPerPage = 300; Set<JUser> users = new LinkedHashSet<JUser>(); Runnable optimizeOnExit = new Runnable() { @Override public void run() { userSearch.refresh(); logger.info(userCounter + " users pushed to default tweet search from " + fromUrl); } }; Runtime.getRuntime().addShutdownHook(new Thread(optimizeOnExit)); for (int page = 0; page < maxPage; page++) { query.attachPagability(page, hitsPerPage); users.clear(); SearchResponse rsp; try { rsp = fromTweetSearch.query(users, query); } catch (Exception ex) { logger.warn("Error while searching!", ex); continue; } if (maxPage == 1) { maxPage = rsp.getHits().getTotalHits() / hitsPerPage + 1; logger.info("Paging though query:" + query.toString()); logger.info("Set numFound to " + rsp.getHits().getTotalHits()); } for (JUser user : users) { userSearch.save(user, false); } userCounter += users.size(); logger.info("Page " + page + " out of " + maxPage + " hitsPerPage:" + hitsPerPage); if (page * hitsPerPage % 100000 == 0) { logger.info("Commit ..."); userSearch.refresh(); } } } public void showFollowers(String user) { // ElasticUserSearch uSearch = createUserSearch(); // Set<SolrUser> jetwickUsers = new LinkedHashSet<SolrUser>(); // uSearch.search(jetwickUsers, new SolrQuery().setRows(10000)); final Set<String> set = new TreeSet<String>(); // for (SolrUser u : jetwickUsers) { // set.add(u.getScreenName()); // } Credits credits = config.getTwitterSearchCredits(); TwitterSearch tw4j = new TwitterSearch().setConsumer(credits.getConsumerKey(), credits.getConsumerSecret()); tw4j.initTwitter4JInstance(credits.getToken(), credits.getTokenSecret(), true); tw4j.getFollowers(user, new AnyExecutor<JUser>() { @Override public JUser execute(JUser o) { // if (set.contains(o.getScreenName())) set.add(o.getScreenName()); return null; } }); for (String u : set) { System.out.println(u); } } public void optimize() { tweetSearch.optimize(); } public <T extends DbObject> void copyIndex(String newIndex, AbstractElasticSearch<T> search, int hitsPerPage) { try { logger.info("Old index has totalhits:" + search.countAll()); if (!search.indexExists(newIndex)) { logger.info("New Index '" + newIndex + "' does not exist! create it before copy!"); return; } logger.info("Now copy from " + search.getIndexName() + " to " + newIndex); search.mergeIndices(Arrays.asList(search.getIndexName()), newIndex, hitsPerPage, true, search, null); search.setIndexName(newIndex); logger.info("New index has totalhits:" + search.countAll() + " Now optimize ..."); search.optimize(); } catch (Exception ex) { logger.error("Exception while copyIndex", ex); } } public void clearUserTokens(String newIndex) { try { logger.info("Old index has totalhits:" + userSearch.countAll()); if (!userSearch.indexExists(newIndex)) { logger.info("New Index '" + newIndex + "' does not exist! create it before copy!"); return; } logger.info("Now copy from " + userSearch.getIndexName() + " to " + newIndex + " and clear user tokens"); userSearch.mergeIndices(Arrays.asList(userSearch.getIndexName()), newIndex, 10000, true, new CreateObjectsInterface<JUser>() { @Override public List<JUser> collectObjects(SearchResponse rsp) { List<JUser> users = userSearch.collectObjects(rsp); for (JUser u : users) { u.setTwitterToken(null); u.setTwitterTokenSecret(null); } return users; } }, null); userSearch.setIndexName(newIndex); logger.info("New index has totalhits:" + userSearch.countAll() + " Now optimize ..."); userSearch.optimize(); } catch (Exception ex) { logger.error("Exception while copyIndex", ex); } } public void clearFriendsUpdate(String userStr) { JUser user = userSearch.findByScreenName(userStr); user.setLastFriendsUpdate(new Date(0)); userSearch.save(user, true); logger.info("stored user:" + user + " collector should update friends immediately"); } private void copyUserIndex(String newIndex, int hitsPerPage) { copyIndex(newIndex, userSearch, hitsPerPage); } private void copyTweetIndex(String newIndex, int hitsPerPage) { copyIndex(newIndex, tweetSearch, hitsPerPage); } }