/**
 *  ClassicHarvester
 *  Copyright 13.11.2015 by Michael Peter Christen, @0rb1t3r
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package org.loklak.harvester.strategy;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.NoSuchElementException;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.eclipse.jetty.util.log.Log;
import org.loklak.api.search.SearchServlet;
import org.loklak.api.search.SuggestServlet;
import org.loklak.data.DAO;
import org.loklak.harvester.PushThread;
import org.loklak.harvester.TwitterScraper;
import org.loklak.objects.MessageEntry;
import org.loklak.objects.QueryEntry;
import org.loklak.objects.ResultList;
import org.loklak.objects.Timeline;
import org.loklak.objects.Timeline.Order;
import org.loklak.tools.DateParser;

public class ClassicHarvester implements Harvester {

    private final int FETCH_RANDOM = 3;
    private final int HITS_LIMIT_4_QUERIES = 20;
    private final int MAX_PENDING = 200; // this could be much larger but we don't want to cache too many of these
    private final int MAX_HARVESTED = 10000; // just to prevent a memory leak with a possible OOM after a long time, we flush that cache after a while

    private final Random random = new Random(System.currentTimeMillis());
    public final ExecutorService executor = Executors.newFixedThreadPool(1);

    private LinkedHashSet<String> pendingQueries = new LinkedHashSet<>();
    private ArrayList<String> pendingContext = new ArrayList<>();
    private Set<String> harvestedContext = new HashSet<>();

    private int hitsOnBackend = 1000;

    /**
     * Collect the mentions and hashtags of all messages in the timeline
     * as candidate context queries.
     */
    public void checkContext(Timeline tl, boolean front) {
        for (MessageEntry tweet: tl) {
            for (String user: tweet.getMentions()) checkContext("from:" + user, front);
            for (String hashtag: tweet.getHashtags()) checkContext(hashtag, front);
        }
    }

    public void checkContext(String s, boolean front) {
        if (!front && pendingContext.size() > MAX_PENDING) return; // queue is full
        if (!harvestedContext.contains(s) && !pendingContext.contains(s)) {
            if (front) pendingContext.add(0, s); else pendingContext.add(s);
        }
        while (pendingContext.size() > MAX_PENDING) pendingContext.remove(pendingContext.size() - 1);
        if (harvestedContext.size() > MAX_HARVESTED) harvestedContext.clear();
    }

    public int harvest() {
        String backend = DAO.getConfig("backend", "http://loklak.org");

        if (random.nextInt(100) != 0 && hitsOnBackend < HITS_LIMIT_4_QUERIES
                && pendingQueries.size() == 0 && pendingContext.size() > 0) {
            // harvest using the collected keys instead of using the queries;
            // pick a random key from the front half of the context queue
            int r = random.nextInt((pendingContext.size() / 2) + 1);
            String q = pendingContext.remove(r);
            harvestedContext.add(q);
            Timeline tl = TwitterScraper.search(q, Timeline.Order.CREATED_AT, true, true, 400);
            if (tl == null || tl.size() == 0) return -1;

            // find content query strings and store them in the context cache
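            // with front=false, newly discovered mentions and hashtags are appended
            // to the tail of pendingContext, behind keys that were collected earlier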
            checkContext(tl, false);
            DAO.log("retrieval of " + tl.size() + " new messages for q = " + q
                    + ", scheduled push; pendingQueries = " + pendingQueries.size()
                    + ", pendingContext = " + pendingContext.size()
                    + ", harvestedContext = " + harvestedContext.size());
            return tl.size();
        }

        // load more queries if pendingQueries is empty
        if (pendingQueries.size() == 0) {
            try {
                ResultList<QueryEntry> rl = SuggestServlet.suggest(
                        backend, "", "query",
                        Math.min(1000, Math.max(FETCH_RANDOM * 30, hitsOnBackend / 10)),
                        "asc", "retrieval_next", DateParser.getTimezoneOffset(),
                        null, "now", "retrieval_next", FETCH_RANDOM);
                for (QueryEntry qe: rl) {
                    pendingQueries.add(qe.getQuery());
                }
                hitsOnBackend = (int) rl.getHits();
                DAO.log("got " + rl.size() + " suggestions for harvesting from " + hitsOnBackend + " in backend");
                if (hitsOnBackend == 0) {
                    // the backend does not have any new query words at this time
                    if (pendingContext.size() == 0) {
                        // try to fill the pendingContext using a matchall-query from the cache:
                        // http://loklak.org/api/search.json?source=cache&q=
                        Timeline tl = SearchServlet.search(backend, "", Timeline.Order.CREATED_AT, "cache", 100, 0, SearchServlet.backend_hash, 60000);
                        checkContext(tl, false);
                    }
                    // if we still don't have any context, we are a bit helpless and hope that this
                    // situation will improve in the future. To prevent this from being called
                    // excessively fast, do a pause.
                    if (pendingContext.size() == 0) try { Thread.sleep(10000); } catch (InterruptedException e) {}
                }
            } catch (IOException e) {
                Log.getLog().warn(e.getMessage());
                try { Thread.sleep(10000); } catch (InterruptedException e1) {} // if the remote peer is down, throttle down
            }
        }

        if (pendingQueries.size() == 0) return -1;

        // take one of the pending queries or pending context and load the tweets
        String q = "";
        try {
            q = pendingQueries.iterator().next();
            pendingQueries.remove(q);
            pendingContext.remove(q);
            harvestedContext.add(q);
            Timeline tl = TwitterScraper.search(q, Timeline.Order.CREATED_AT, true, false, 400);

            if (tl == null || tl.size() == 0) {
                // even if the result is empty, we must push this to the backend
                // to make it possible that the query gets an update
                if (tl == null) tl = new Timeline(Order.CREATED_AT);
                tl.setQuery(q);
                PushThread pushThread = new PushThread(backend, tl);
                DAO.log("starting push to backend; pendingQueries = " + pendingQueries.size()
                        + ", pendingContext = " + pendingContext.size()
                        + ", harvestedContext = " + harvestedContext.size());
                executor.execute(pushThread);
                return -1;
            }

            // find content query strings and store them in the context cache;
            // front=true gives these keys priority over keys from context searches
            checkContext(tl, true);

            // if we loaded a pending query, push the results to the backend right now
            tl.setQuery(q);
            PushThread pushThread = new PushThread(backend, tl);
            DAO.log("starting push to backend; pendingQueries = " + pendingQueries.size()
                    + ", pendingContext = " + pendingContext.size()
                    + ", harvestedContext = " + harvestedContext.size());
            executor.execute(pushThread);
            return tl.size();
        } catch (NoSuchElementException e) {
            // this is a concurrency glitch, just do nothing
            return -1;
        }
    }

    public void stop() {
        executor.shutdown();
    }
}
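/*
 * Minimal usage sketch (an illustration, not part of the original class): the
 * loklak server normally drives a Harvester from its own maintenance loop, so
 * the hypothetical driver below only shows the expected call pattern.
 * harvest() returns the number of fetched messages, or -1 if nothing could be
 * fetched in this round, in which case a caller would typically throttle down.
 * Running this for real requires an initialized DAO and a reachable backend.
 */
class ClassicHarvesterDriverSketch {
    public static void main(String[] args) throws InterruptedException {
        ClassicHarvester harvester = new ClassicHarvester();
        for (int i = 0; i < 100; i++) {
            int fetched = harvester.harvest();
            if (fetched < 0) Thread.sleep(3000); // nothing fetched: back off briefly
        }
        harvester.stop(); // shut down the push executor
    }
}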