package com.mozilla.grouperfish.util.loader;

import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.lang.Thread.UncaughtExceptionHandler;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.RejectedExecutionHandler;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.mozilla.grouperfish.base.StreamTool;
import com.mozilla.grouperfish.model.NamedSource;


/**
 * Helps loading a remote bagheera installation with documents.
 *
 * Every item is sent as an HTTP PUT of {@code item.source()} to
 * {@code <baseUrl>/<item.name()>}.
 *
 * @param <T> The type of items to load.
 */
public class Loader<T extends NamedSource> {

    /** Number of items that are handed to a worker as one insert task. */
    public static final int BATCH_SIZE = 100;

    private static final Logger log = LoggerFactory.getLogger(Loader.class);

    private final String baseUrl_;

    /**
     * @param baseUrl The url to a grouperfish resource to use as destination.
     *                Example: http://localhost:61732/documents/mynamespace
     */
    public Loader(final String baseUrl) {
        baseUrl_ = baseUrl;
    }

    /**
     * Load a single item into Grouperfish, synchronously on the calling thread.
     * Whenever multiple items need to be loaded, clients should make use of
     * {@link #load(Iterable)}.
     *
     * @param item The item to load.
     * @throws RuntimeException If the item could not be written after all retries.
     */
    public void load(T item) {
        final List<T> wrapper = new ArrayList<T>();
        wrapper.add(item);
        new InsertTask<T>(baseUrl_, wrapper).run();
    }

    /**
     * Loads documents into Grouperfish using a multithreaded client.
     *
     * Items are grouped into batches of {@link #BATCH_SIZE} and each batch is
     * handed to a worker pool. The pool is shut down before this method returns,
     * also if iterating the given stream fails.
     *
     * @param stream The items to load.
     * @return The number of items that were taken from the stream and queued for
     *         loading. Batches that fail are logged; they are not subtracted.
     */
    public int load(Iterable<T> stream) {
        log.debug("Starting import into map '{}'", baseUrl_);
        final ExecutorService workers = workers();

        int count = 0;
        try {
            List<T> batch = new ArrayList<T>(BATCH_SIZE);
            for (final T item : stream) {
                batch.add(item);
                ++count;
                if (count % BATCH_SIZE == 0) {
                    // execute() rather than submit(): submit() would capture task
                    // failures in a Future nobody looks at, so that the uncaught
                    // exception handler of the workers would never see them.
                    workers.execute(new InsertTask<T>(baseUrl_, batch));
                    batch = new ArrayList<T>(BATCH_SIZE);
                }
                if (count % 5000 == 0) {
                    log.info("Queued {} items into map {}", count, baseUrl_);
                }
            }
            if (!batch.isEmpty()) {
                workers.execute(new InsertTask<T>(baseUrl_, batch));
            }
        }
        finally {
            // Queueing blocks while the work queue is full (see workers()), so at
            // this point everything has been handed over and we can wind down.
            // Doing this in finally keeps the worker threads from leaking when
            // the stream throws.
            shutdownGracefully(workers);
        }
        return count;
    }

    /**
     * Creates the worker pool: 5 core threads, up to 10 threads, and a bounded
     * queue of 100 tasks. When the pool rejects a task, the rejection handler
     * makes the submitting thread block until there is room in the queue.
     * Uncaught exceptions from workers are logged.
     */
    private ExecutorService workers() {
        final ThreadFactory factory = new ThreadFactory() {
            @Override
            public Thread newThread(final Runnable r) {
                final Thread worker = new Thread(r);
                worker.setUncaughtExceptionHandler(new UncaughtExceptionHandler() {
                    @Override
                    public void uncaughtException(final Thread t, final Throwable e) {
                        log.error("Uncaught exception from bagheera load worker.", e);
                    }
                });
                return worker;
            }
        };

        final RejectedExecutionHandler blockUntilQueued = new RejectedExecutionHandler() {
            @Override
            public void rejectedExecution(Runnable task, ThreadPoolExecutor executor) {
                try {
                    executor.getQueue().put(task);
                }
                catch (InterruptedException e) {
                    // Restore the flag so that callers can still observe the interrupt.
                    Thread.currentThread().interrupt();
                    throw new RuntimeException(e);
                }
            }
        };

        return new ThreadPoolExecutor(
                5, 10, 90, TimeUnit.SECONDS,
                new ArrayBlockingQueue<Runnable>(100),
                factory,
                blockUntilQueued);
    }

    /**
     * Shuts down the pool and waits up to 120 seconds for queued tasks to finish.
     * After that, remaining tasks are cancelled and another 60 seconds are granted.
     * If the pool still has not terminated, the JVM is exited with status 1.
     */
    private void shutdownGracefully(final ExecutorService pool) {
        pool.shutdown();
        try {
            if (pool.awaitTermination(120, TimeUnit.SECONDS)) return;
            pool.shutdownNow();
            if (pool.awaitTermination(60, TimeUnit.SECONDS)) return;
            log.error("Importer pool did not terminate within timeout.");
            System.exit(1);
        }
        catch (InterruptedException e) {
            pool.shutdownNow();
            Thread.currentThread().interrupt();
        }
    }

    /** Each insert task submits a batch of items, one HTTP PUT per item. */
    static class InsertTask<T extends NamedSource> implements Runnable {

        private static final Charset UTF8 = Charset.forName("UTF8");

        /** How often a single item is attempted before the task gives up. */
        private static final int MAX_ATTEMPTS = 5;

        private final String baseUrl_;
        private final List<T> items_;

        InsertTask(final String baseUrl, final List<T> items) {
            baseUrl_ = baseUrl;
            items_ = items;
        }

        /**
         * Writes all items of the batch, in order. An item that fails with an
         * IOException is retried; an HTTP error status is logged and not retried.
         *
         * @throws RuntimeException When an item still fails on the last attempt.
         *                          The remaining items of the batch are not written.
         */
        @Override
        public void run() {
            log.trace("Insert task has {} items", items_.size());
            if (items_.size() == 0) return;

            for (final T item : items_) {
                final String resource = baseUrl_ + "/" + item.name();
                log.trace("Writing '{}' to '{}'", item.name(), resource);

                int retriesLeft = MAX_ATTEMPTS;
                boolean done = false;
                while (!done && retriesLeft > 0) {
                    try {
                        put(item, resource);
                        done = true;
                    }
                    catch (IOException e) {
                        final T from = items_.get(0);
                        final T to = items_.get(items_.size() - 1);
                        log.error(String.format("While inserting batch %s,%s", from.name(), to.name()));
                        log.error("IO Error in importer", e);
                        --retriesLeft;
                        if (retriesLeft == 0) {
                            log.error("No retries left (putting resource '" + resource + "'). Giving up.", e);
                            throw new RuntimeException(e);
                        }
                    }
                }
            }
        }

        /** Performs one PUT of the item source to the resource and logs the response status. */
        private void put(final T item, final String resource) throws IOException {
            final HttpURLConnection conn = (HttpURLConnection) new URL(resource).openConnection();
            conn.setRequestMethod("PUT");
            conn.setDoInput(true);
            conn.setDoOutput(true);
            conn.setUseCaches(false);
            conn.setRequestProperty("Content-Type", "application/json");

            final Writer wr = new OutputStreamWriter(conn.getOutputStream(), UTF8);
            try {
                wr.write(item.source());
                wr.flush();
            }
            finally {
                // Also close when writing fails, so the request stream does not leak.
                wr.close();
            }

            final int status = conn.getResponseCode();
            if (status >= 200 && status < 400) {
                log.trace("HTTP response status code: {}", status);
                releaseResponse(conn);
                return;
            }

            // getErrorStream() is null when the server did not send an error body.
            final InputStream error = conn.getErrorStream();
            try {
                final Object body = (error == null) ? "" : StreamTool.consume(error, UTF8);
                log.warn("Putting resource '" + resource + "': HTTP status: {} ({})", status, body);
            }
            finally {
                closeQuietly(error);
            }
        }

        /**
         * Closes the response stream of a successful request so that the network
         * resources of the connection are freed. Failing to do so must not trigger
         * a retry (the PUT went through), which is why problems are only traced.
         */
        private static void releaseResponse(final HttpURLConnection conn) {
            try {
                closeQuietly(conn.getInputStream());
            }
            catch (IOException ignored) {
                log.trace("Could not obtain response stream for closing.", ignored);
            }
        }

        /** Closes the stream if there is one. Problems are traced, not propagated. */
        private static void closeQuietly(final Closeable stream) {
            if (stream == null) return;
            try {
                stream.close();
            }
            catch (IOException ignored) {
                log.trace("Could not close stream.", ignored);
            }
        }
    }

}