package com.zqh.mr;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Scanner;
import java.util.TreeMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.Logger;
public class MapReduceImpl implements MapReduce {
private final int BUCKETS;
ExecutorService executor = null;
Map<Object, Integer> finalResult;
private final Logger log = Logger.getLogger(MapReduceImpl.class);
public MapReduceImpl(int threads) {
BasicConfigurator.configure();
BUCKETS = threads;
}
@Override
public Map<Object, Integer> mapReduce(List<Object> data) {
executor = Executors.newFixedThreadPool(BUCKETS);
finalResult = new HashMap<Object, Integer>();
List<Object[]> buckets = divideIntoBuckets(BUCKETS, data);
for (final Object[] bucket : buckets)
executor.execute(new MapAndReduce(bucket));
executor.shutdown();
while (!executor.isTerminated()) {
}
return finalResult;
}
private List<Map.Entry<Object, Integer>> createIntermidiateMap(
Object[] tokens) {
List<Entry<Object, Integer>> map = new ArrayList<Map.Entry<Object, Integer>>();
for (Object token : tokens) {
map.add(new AbstractMap.SimpleEntry<Object, Integer>(token
.toString(), 1));
}
return map;
}
private Map<Object, Integer> reduce(
List<Entry<Object, Integer>> intermediateMap2) {
log.debug("start reduce");
Map<Object, Integer> output = new HashMap<Object, Integer>();
for (Map.Entry<Object, Integer> pair : intermediateMap2) {
if (output.containsKey(pair.getKey())) {
// need to refresh value - do not use value from pair - it is
// always has value = 1
output.put(pair.getKey(), output.get(pair.getKey()) + 1);
} else {
output.put(pair.getKey(), 1);
}
}
log.debug("finish reduce");
return output;
}
/**
* divide string into buckets with provided size
*
* @param bucketSize
* @return
*/
private List<Object[]> divideIntoBuckets(int bucketSize, List<Object> words) {
log.info("start bucketing");
Object[] tokens = words.toArray();
int chunk = tokens.length / bucketSize;
int rem = tokens.length % chunk;
List<Object[]> list = new ArrayList<Object[]>();
int subTokenPosition = 0;
List<String> subTokenList = new ArrayList<String>();
for (int i = 0; i < tokens.length; i++) {
if (subTokenPosition < chunk) {
subTokenList.add(tokens[i].toString());
subTokenPosition++;
// brilliant architecture: if bucket size == 1, add and rest in
// peace
if (subTokenPosition == tokens.length) {
list.add(subTokenList.toArray());
break;
}
} else {
subTokenPosition = 0;
list.add(subTokenList.toArray());
subTokenList = new ArrayList<String>();
// rewind i
i--;
}
}
if (bucketSize > 2) {
// add rest
String[] subTokensRest = new String[rem];
for (int rest = 0; rest < rem; rest++) {
subTokensRest[rest] = tokens[(tokens.length - 1) - (rest)]
.toString();
}
// do not add if no elements found for rest
if (subTokensRest.length > 0)
list.add(subTokensRest);
}
log.info("finish bucketing, buckets: " + list.size());
return list;
}
@Override
public Map<String, Integer> simpleWordCounting(List<Object> words) {
Object[] tokens = words.toArray();
Map<String, Integer> wordOccurence = new HashMap<String, Integer>();
for (Object token : tokens) {
if (wordOccurence.containsKey(token)) {
Integer occured = (wordOccurence.get(token));
occured++;
wordOccurence.put(token.toString(), occured++);
} else {
wordOccurence.put(token.toString(), 1);
}
}
return wordOccurence;
}
private class MapByValueComparator implements Comparator<Object> {
private Map<Object, Integer> map;
public MapByValueComparator(Map<Object, Integer> map) {
this.map = map;
}
@Override
public int compare(Object key1, Object key2) {
int value1 = map.get(key1);
int value2 = map.get(key2);
int diff = value2 - value1;
if (diff == 0)
return key1.hashCode() - key2.hashCode();
else
return diff;
}
}
@Override
public Map<Object, Integer> sortMap(Map<Object, Integer> mapToSort) {
// sort descending by values
Map<Object, Integer> sortedMap = new TreeMap<Object, Integer>(
new MapByValueComparator(mapToSort));
sortedMap.putAll(mapToSort);
return sortedMap;
}
@Override
public List<Object> readFile(String pathToFile) {
List<Object> words = new ArrayList<Object>();
try {
log.info("read file:" + pathToFile);
File file = new File(pathToFile);
Scanner scan = new Scanner(file);
while (scan.hasNext()) {
String word = scan.next();
word = sanitizeString(word);
words.add(word);
}
log.info("file succesfully read");
} catch (FileNotFoundException e) {
log.error(e.getMessage());
e.printStackTrace();
}
return words;
}
private String sanitizeString(String input) {
StringBuffer buf = new StringBuffer();
buf = new StringBuffer(input.replace(".", ""));
buf = new StringBuffer(buf.toString().replace(",", ""));
buf = new StringBuffer(buf.toString().replace(":", ""));
buf = new StringBuffer(buf.toString().replace(";", ""));
buf = new StringBuffer(buf.toString().replace("!", ""));
buf = new StringBuffer(buf.toString().replace("?", ""));
return buf.toString().toLowerCase();
}
@Override
public void displayMap(Map<Object, Integer> map, int values) {
log.info("---\n display map of size: " + map.size());
for (Map.Entry<Object, Integer> pair : map.entrySet()) {
if (values-- > 0)
log.info(pair.getKey() + " -> " + pair.getValue());
}
}
class MapAndReduce implements Runnable {
private Object[] bucket;
MapAndReduce(Object[] bucket) {
this.bucket = bucket;
}
@Override
public void run() {
log.info("A + start map of bucket " + bucket[0]);
List<Map.Entry<Object, Integer>> map = createIntermidiateMap(bucket);
log.info("A - finished map " + bucket[0] + " with map size = "
+ map.size());
log.info("B + start reduce of " + bucket[0]);
Map<Object, Integer> result = reduce(map);
log.info("B - finished reduce of " + bucket[0] + " with "
+ result.size());
log.info("C + final map size before:: " + finalResult.size());
synchronized (finalResult) {
for (Entry<Object, Integer> entry : result.entrySet()) {
if (finalResult.containsKey(entry.getKey())) {
finalResult.put(
entry.getKey(),
finalResult.get(entry.getKey())
+ entry.getValue());
} else
finalResult.put(entry.getKey(), entry.getValue());
}
}
log.info("C - finalResult after " + finalResult.size());
}
};
@Override
public List<Object> generateData() {
int DATA_SIZE = 1048576;
List<Object> list = new ArrayList<Object>();
int min = 1;
int max = 1000000;
for (int i = 0; i < DATA_SIZE; i++) {
list.add(min + (int) (Math.random() * ((max - min) + 1)));
}
return list;
};
}