package edu.berkeley.nlp.mapper; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; /** * Utility Class for parallelizing a collection of items which need to be parallized. To use this you must * extends MapWorker<Item> which will process an instance of Item. When you call doMapping(Iterator<Item>,bufSize), * we take bufSize eleems out of the iterator and distribute the processing of those items, then take bufSize more, * and so on. * * @author aria42 * * @param <Item> */ public class Mapper<Item> { private int numWorkers ; private MapWorkerFactory<Item> factory; public Mapper(MapWorkerFactory<Item> factory) { this.factory = factory; this.numWorkers = Runtime.getRuntime().availableProcessors(); } public Mapper(final Class c) { this(new MapWorkerFactory<Item>() { public MapWorker<Item> newMapWorker() { try { return (MapWorker<Item>) c.newInstance(); } catch (Exception e) { e.printStackTrace(); } return null; } }); } public void setNumWorkers(int numWorkers) { this.numWorkers = numWorkers; } public List<MapWorker<Item>> doMapping(List<Item> items) { List<MapWorker<Item>> workers = new ArrayList<MapWorker<Item>>(); for (int i=0; i < numWorkers; ++i) { MapWorker<Item> worker = factory.newMapWorker(); workers.add(worker); } doMapping(items, workers); return workers; } private void doMapping(List<Item> items, List<MapWorker<Item>> workers) { ExecutorService executor = Executors.newFixedThreadPool(workers.size()); for (int i=0; i < workers.size(); ++i) { int start = (int) ((i/(double) workers.size()) * items.size()); int end = (int) (((i+1)/(double) workers.size()) * items.size()); List<Item> localItems = items.subList(start, end); MapWorker<Item> worker = workers.get(i); worker.setItems(localItems); executor.execute(worker); } execute(executor); for (MapWorker<Item> worker : workers) { worker.reduce(); } } private void execute(ExecutorService executor) { executor.shutdown(); try { executor.awaitTermination(10000, TimeUnit.SECONDS); } catch (InterruptedException e) { e.printStackTrace(); } } public List<MapWorker<Item>> doMapping(Iterator<Item> itemIt) { return doMapping(itemIt, 10000); } public List<MapWorker<Item>> doMapping(Iterator<Item> itemIt, int bufSize) { List<MapWorker<Item>> workers = new ArrayList<MapWorker<Item>>(); int numProcessed = 0; for (int i=0; i < numWorkers; ++i) { MapWorker<Item> worker = factory.newMapWorker(); workers.add(worker); } while (itemIt.hasNext()) { List<Item> items = new ArrayList<Item>(); for (int i=0; i < bufSize; ++i) { if (!itemIt.hasNext()) break; items.add(itemIt.next()); } doMapping(items, workers); System.gc(); numProcessed += bufSize; //System.out.println("[Mapper] done processing " + numProcessed); } return workers; } public Object getNumWorkers() { return numWorkers; } public static void main(String[] args) { class MyMapper extends MapWorker<Integer> { public void map(Integer item) { System.out.println("\tProcessing " + item); } } MapWorkerFactory<Integer> factory = new MapWorkerFactory<Integer>() { public MapWorker<Integer> newMapWorker() { return new MyMapper(); } }; Mapper<Integer> mapper = new Mapper<Integer>(factory); List<Integer> items = new ArrayList<Integer>(); for (int i=0; i < 10000; ++i) { items.add(i); } mapper.doMapping(items.iterator(),10); } }