/**
 *  ClassicHarvester
 *  Copyright 13.11.2015 by Michael Peter Christen, @0rb1t3r
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package org.loklak.harvester.strategy;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.NoSuchElementException;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.eclipse.jetty.util.log.Log;
import org.loklak.api.search.SearchServlet;
import org.loklak.api.search.SuggestServlet;
import org.loklak.data.DAO;
import org.loklak.harvester.PushThread;
import org.loklak.harvester.TwitterScraper;
import org.loklak.objects.MessageEntry;
import org.loklak.objects.QueryEntry;
import org.loklak.objects.ResultList;
import org.loklak.objects.Timeline;
import org.loklak.objects.Timeline.Order;
import org.loklak.tools.DateParser;

public class ClassicHarvester implements Harvester {

    private final int FETCH_RANDOM = 3;
    private final int HITS_LIMIT_4_QUERIES = 20;
    private final int MAX_PENDING = 200; // this could be much larger but we don't want to cache too many of these
    private final int MAX_HARVESTED = 10000; // just to prevent a memory leak with a possible OOM after a long time, we flush that cache after a while

    private final Random random = new Random(System.currentTimeMillis());
    public final ExecutorService executor = Executors.newFixedThreadPool(1);

    private LinkedHashSet<String> pendingQueries = new LinkedHashSet<>();
    private ArrayList<String> pendingContext = new ArrayList<>();
    private Set<String> harvestedContext = new HashSet<>();

    private int hitsOnBackend = 1000;

    /**
     * Collect the mentions and hashtags of all messages in the timeline
     * as candidate context queries.
     */
    public void checkContext(Timeline tl, boolean front) {
        for (MessageEntry tweet: tl) {
            for (String user: tweet.getMentions()) checkContext("from:" + user, front);
            for (String hashtag: tweet.getHashtags()) checkContext(hashtag, front);
        }
    }

    public void checkContext(String s, boolean front) {
        if (!front && pendingContext.size() > MAX_PENDING) return; // queue is full
        if (!harvestedContext.contains(s) && !pendingContext.contains(s)) {
            if (front) pendingContext.add(0, s); else pendingContext.add(s);
        }
        while (pendingContext.size() > MAX_PENDING) pendingContext.remove(pendingContext.size() - 1);
        if (harvestedContext.size() > MAX_HARVESTED) harvestedContext.clear();
    }

    public int harvest() {
        String backend = DAO.getConfig("backend", "http://loklak.org");

        if (random.nextInt(100) != 0 && hitsOnBackend < HITS_LIMIT_4_QUERIES
                && pendingQueries.size() == 0 && pendingContext.size() > 0) {
            // harvest using the collected keys instead of using the queries;
            // pick a random key from the front half of the context queue
            int r = random.nextInt((pendingContext.size() / 2) + 1);
            String q = pendingContext.remove(r);
            harvestedContext.add(q);
            Timeline tl = TwitterScraper.search(q, Timeline.Order.CREATED_AT, true, true, 400);
            if (tl == null || tl.size() == 0) return -1;

            // find content query strings and store them in the context cache
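            // with front=false, newly discovered mentions and hashtags are appended
            // to the tail of pendingContext, behind keys that were collected earlier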
            checkContext(tl, false);
            DAO.log("retrieval of " + tl.size() + " new messages for q = " + q
                    + ", scheduled push; pendingQueries = " + pendingQueries.size()
                    + ", pendingContext = " + pendingContext.size()
                    + ", harvestedContext = " + harvestedContext.size());
            return tl.size();
        }

        // load more queries if pendingQueries is empty
        if (pendingQueries.size() == 0) {
            try {
                ResultList<QueryEntry> rl = SuggestServlet.suggest(
                        backend, "", "query",
                        Math.min(1000, Math.max(FETCH_RANDOM * 30, hitsOnBackend / 10)),
                        "asc", "retrieval_next", DateParser.getTimezoneOffset(),
                        null, "now", "retrieval_next", FETCH_RANDOM);
                for (QueryEntry qe: rl) {
                    pendingQueries.add(qe.getQuery());
                }
                hitsOnBackend = (int) rl.getHits();
                DAO.log("got " + rl.size() + " suggestions for harvesting from " + hitsOnBackend + " in backend");
                if (hitsOnBackend == 0) {
                    // the backend does not have any new query words at this time
                    if (pendingContext.size() == 0) {
                        // try to fill the pendingContext using a matchall-query from the cache:
                        // http://loklak.org/api/search.json?source=cache&q=
                        Timeline tl = SearchServlet.search(backend, "", Timeline.Order.CREATED_AT, "cache", 100, 0, SearchServlet.backend_hash, 60000);
                        checkContext(tl, false);
                    }
                    // if we still don't have any context, we are a bit helpless and hope that this
                    // situation will improve in the future. To prevent this from being called
                    // excessively fast, do a pause.
                    if (pendingContext.size() == 0) try { Thread.sleep(10000); } catch (InterruptedException e) {}
                }
            } catch (IOException e) {
                Log.getLog().warn(e.getMessage());
                try { Thread.sleep(10000); } catch (InterruptedException e1) {} // if the remote peer is down, throttle down
            }
        }

        if (pendingQueries.size() == 0) return -1;

        // take one of the pending queries or pending context and load the tweets
        String q = "";
        try {
            q = pendingQueries.iterator().next();
            pendingQueries.remove(q);
            pendingContext.remove(q);
            harvestedContext.add(q);
            Timeline tl = TwitterScraper.search(q, Timeline.Order.CREATED_AT, true, false, 400);

            if (tl == null || tl.size() == 0) {
                // even if the result is empty, we must push this to the backend
                // to make it possible that the query gets an update
                if (tl == null) tl = new Timeline(Order.CREATED_AT);
                tl.setQuery(q);
                PushThread pushThread = new PushThread(backend, tl);
                DAO.log("starting push to backend; pendingQueries = " + pendingQueries.size()
                        + ", pendingContext = " + pendingContext.size()
                        + ", harvestedContext = " + harvestedContext.size());
                executor.execute(pushThread);
                return -1;
            }

            // find content query strings and store them in the context cache;
            // front=true gives these keys priority over keys from context searches
            checkContext(tl, true);

            // if we loaded a pending query, push the results to the backend right now
            tl.setQuery(q);
            PushThread pushThread = new PushThread(backend, tl);
            DAO.log("starting push to backend; pendingQueries = " + pendingQueries.size()
                    + ", pendingContext = " + pendingContext.size()
                    + ", harvestedContext = " + harvestedContext.size());
            executor.execute(pushThread);
            return tl.size();
        } catch (NoSuchElementException e) {
            // this is a concurrency glitch, just do nothing
            return -1;
        }
    }

    public void stop() {
        executor.shutdown();
    }
}
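/*
 * Minimal usage sketch (an illustration, not part of the original class): the
 * loklak server normally drives a Harvester from its own maintenance loop, so
 * the hypothetical driver below only shows the expected call pattern.
 * harvest() returns the number of fetched messages, or -1 if nothing could be
 * fetched in this round, in which case a caller would typically throttle down.
 * Running this for real requires an initialized DAO and a reachable backend.
 */
class ClassicHarvesterDriverSketch {
    public static void main(String[] args) throws InterruptedException {
        ClassicHarvester harvester = new ClassicHarvester();
        for (int i = 0; i < 100; i++) {
            int fetched = harvester.harvest();
            if (fetched < 0) Thread.sleep(3000); // nothing fetched: back off briefly
        }
        harvester.stop(); // shut down the push executor
    }
}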