package fj.demo.concurrent;
import fj.F;
import fj.Monoid;
import fj.P;
import fj.P1;
import fj.P2;
import fj.control.parallel.ParModule;
import fj.control.parallel.Promise;
import fj.control.parallel.Strategy;
import fj.data.IOFunctions;
import fj.data.Iteratee.Input;
import fj.data.Iteratee.IterV;
import fj.data.List;
import fj.data.Option;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Reader;
import java.io.StringWriter;
import java.io.Writer;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import static fj.Monoid.monoid;
import static fj.control.parallel.ParModule.parModule;
import static fj.data.List.nil;
import static java.util.concurrent.Executors.newFixedThreadPool;
/**
* Reads words and their counts from files ({@link #getWordsAndCountsFromFilesWithIteratee} in a single
* thread and {@link #getWordsAndCountsFromFilesInParallel} in multiple threads). The files are created
* when the demo starts and are populated with generated sample content.
*
* @author Martin Grotzke
*/
public class WordCount {
/**
 * Increments an integer by one.
 * Written by hand because {@code Integers.add.f(1)} caused an SOE...
 */
private static final F<Integer,Integer> addOne = n -> n + 1;
/**
 * Replaces the value stored under {@code key} with {@code valueFunction} applied to it.
 * When {@code map.get(key)} yields {@code null}, {@code initialValue} is fed to the function instead.
 * The map is modified in place.
 *
 * @param map the map to modify
 * @param key the key whose value is recomputed
 * @param valueFunction computes the new value from the current (or initial) one
 * @param initialValue the value used when nothing is stored under {@code key}
 * @return the same {@code map} instance that was passed in
 */
private static <K, V> Map<K, V> update(Map<K, V> map, K key, F<V, V> valueFunction,
    V initialValue) {
  final V current = map.get(key);
  final V base = current != null ? current : initialValue;
  map.put(key, valueFunction.f(base));
  return map;
}
/**
 * Maps a file name to the word counts of that file by handing the file and the
 * {@link #wordCountsFromCharChunks()} iteratee to {@code IOFunctions.enumFileCharChunks}.
 * {@code Option.none()} is passed for the optional argument (presumably the charset; confirm
 * against IOFunctions). An {@link IOException} is rethrown wrapped in a {@link RuntimeException}.
 */
private static final F<String, Map<String, Integer>> fileNameToWordsAndCountsWithCharChunkIteratee = fileName -> {
  final File source = new File(fileName);
  try {
    return IOFunctions
        .enumFileCharChunks(source, Option.none(), wordCountsFromCharChunks())
        .run()
        .run();
  } catch (final IOException ioFailure) {
    throw new RuntimeException(ioFailure);
  }
};
/**
 * Maps a file name to the word counts of that file by handing the file and the
 * {@link #wordCountsFromChars()} iteratee to {@code IOFunctions.enumFileChars}.
 * An {@link IOException} is rethrown wrapped in a {@link RuntimeException}.
 *
 * NOTE(review): despite the "CharChunk2" name this does exactly the same as
 * {@code fileNameToWordsAndCountsWithCharIteratee}; confirm whether a different
 * enumerator/iteratee pair was intended.
 */
private static final F<String, Map<String, Integer>> fileNameToWordsAndCountsWithCharChunk2Iteratee = fileName -> {
  final File source = new File(fileName);
  try {
    return IOFunctions
        .enumFileChars(source, Option.none(), wordCountsFromChars())
        .run()
        .run();
  } catch (final IOException ioFailure) {
    throw new RuntimeException(ioFailure);
  }
};
/**
 * Maps a file name to the word counts of that file by handing the file and the
 * {@link #wordCountsFromChars()} iteratee to {@code IOFunctions.enumFileChars}.
 * {@code Option.none()} is passed for the optional argument (presumably the charset; confirm
 * against IOFunctions). An {@link IOException} is rethrown wrapped in a {@link RuntimeException}.
 */
private static final F<String, Map<String, Integer>> fileNameToWordsAndCountsWithCharIteratee = fileName -> {
  final File source = new File(fileName);
  try {
    return IOFunctions
        .enumFileChars(source, Option.none(), wordCountsFromChars())
        .run()
        .run();
  } catch (final IOException ioFailure) {
    throw new RuntimeException(ioFailure);
  }
};
/**
 * An iteratee that consumes char chunks and calculates word counts.
 * <p>
 * A word is a maximal run of characters for which {@link Character#isWhitespace(char)} is false.
 * The state carried between inputs is a pair of the partially read word and the counts gathered
 * so far, so a word may span several chunks. The counts map is updated in place while chunks
 * are consumed. On EOF a pending word, if any, is counted before the map is returned.
 * <p>
 * The type parameter {@code E} is not used by the implementation.
 *
 * @return an iteratee in its initial state (empty word buffer, empty counts map)
 */
public static final <E> IterV<char[], Map<String, Integer>> wordCountsFromCharChunks() {
  final F<P2<StringBuilder,Map<String, Integer>>, F<Input<char[]>, IterV<char[], Map<String, Integer>>>> loop =
    new F<P2<StringBuilder,Map<String, Integer>>, F<Input<char[]>, IterV<char[], Map<String, Integer>>>>() {
      // Self reference so the handlers below can continue with the next state.
      final F<P2<StringBuilder,Map<String, Integer>>, F<Input<char[]>, IterV<char[], Map<String, Integer>>>> self = this;

      @Override
      public F<Input<char[]>, IterV<char[], Map<String, Integer>>> f(final P2<StringBuilder,Map<String, Integer>> state) {
        // Empty input: stay in the same state and wait for more.
        final P1<IterV<char[], Map<String, Integer>>> onEmpty =
          P.lazy(() -> IterV.cont(self.f(state)));

        // A chunk arrived: split it at whitespace, counting every completed word.
        final P1<F<char[], IterV<char[], Map<String, Integer>>>> onChunk =
          P.lazy(() -> chunk -> {
            StringBuilder word = state._1();
            Map<String, Integer> counts = state._2();
            for (int i = 0; i < chunk.length; i++) {
              final char c = chunk[i];
              if (!Character.isWhitespace(c)) {
                word.append(c);
              } else if (word.length() > 0) {
                counts = update(counts, word.toString(), addOne, Integer.valueOf(0));
                word = new StringBuilder();
              }
            }
            return IterV.cont(self.f(P.p(word, counts)));
          });

        // End of input: count the word still being built, if any, and finish.
        final P1<IterV<char[], Map<String, Integer>>> onEof =
          P.lazy(() -> {
            final StringBuilder pending = state._1();
            if (pending.length() == 0) {
              return IterV.done(state._2(), Input.eof());
            }
            final Map<String, Integer> finalCounts =
              update(state._2(), pending.toString(), addOne, Integer.valueOf(0));
            return IterV.done(finalCounts, Input.eof());
          });

        return input -> input.apply(onEmpty, onChunk, onEof);
      }
    };
  final Map<String, Integer> initialCounts = new HashMap<String, Integer>();
  return IterV.cont(loop.f(P.p(new StringBuilder(), initialCounts)));
}
/**
 * An iteratee that consumes chars and calculates word counts.
 * <p>
 * A word is a maximal run of characters for which {@link Character#isWhitespace(char)} is false.
 * The state carried between inputs is a pair of the partially read word and the counts gathered
 * so far; both the word buffer and the counts map are mutated in place while input is consumed.
 * On EOF a pending word, if any, is counted before the map is returned.
 * <p>
 * The type parameter {@code E} is not used by the implementation.
 *
 * @return an iteratee in its initial state (empty word buffer, empty counts map)
 */
public static final <E> IterV<Character, Map<String, Integer>> wordCountsFromChars() {
  final F<P2<StringBuilder,Map<String, Integer>>, F<Input<Character>, IterV<Character, Map<String, Integer>>>> loop =
    new F<P2<StringBuilder,Map<String, Integer>>, F<Input<Character>, IterV<Character, Map<String, Integer>>>>() {
      // Self reference so the handlers below can continue with the next state.
      final F<P2<StringBuilder,Map<String, Integer>>, F<Input<Character>, IterV<Character, Map<String, Integer>>>> self = this;

      @Override
      public F<Input<Character>, IterV<Character, Map<String, Integer>>> f(final P2<StringBuilder,Map<String, Integer>> state) {
        // Empty input: stay in the same state and wait for more.
        final P1<IterV<Character, Map<String, Integer>>> onEmpty =
          P.lazy(() -> IterV.cont(self.f(state)));

        final P1<F<Character, IterV<Character, Map<String, Integer>>>> onChar =
          P.lazy(() -> ch -> {
            if (!Character.isWhitespace(ch.charValue())) {
              // Part of a word: extend the buffer and carry on with the same state.
              state._1().append(ch);
              return IterV.cont(self.f(state));
            }
            final StringBuilder word = state._1();
            if (word.length() == 0) {
              // another whitespace char, no word to push to the map
              return IterV.cont(self.f(state));
            }
            // Whitespace ends the current word: count it and start with a fresh buffer.
            final Map<String, Integer> counts =
              update(state._2(), word.toString(), addOne, Integer.valueOf(0));
            return IterV.cont(self.f(P.p(new StringBuilder(), counts)));
          });

        // End of input: count the word still being built, if any, and finish.
        final P1<IterV<Character, Map<String, Integer>>> onEof =
          P.lazy(() -> {
            final StringBuilder pending = state._1();
            if (pending.length() == 0) {
              return IterV.done(state._2(), Input.eof());
            }
            final Map<String, Integer> finalCounts =
              update(state._2(), pending.toString(), addOne, Integer.valueOf(0));
            return IterV.done(finalCounts, Input.eof());
          });

        return input -> input.apply(onEmpty, onChar, onEof);
      }
    };
  final Map<String, Integer> initialCounts = new HashMap<String, Integer>();
  return IterV.cont(loop.f(P.p(new StringBuilder(), initialCounts)));
}
/**
 * Runs the word count demo: writes the sample files, warms up every counting variant on the
 * first file, then times each variant over all files and checks its result against the
 * expected counts.
 * <p>
 * The sample files are deleted in a {@code finally} block, so they are removed even when one
 * of the checks fails.
 *
 * @param args ignored
 * @throws IOException if the sample files cannot be written
 */
public static void main(String[] args) throws IOException {
  // setup
  final int numFiles = 1;
  final int numSharedWords = 5000000;
  final P2<List<String>, Map<String, Integer>> result = writeSampleFiles(numFiles, numSharedWords);
  final List<String> fileNames = result._1();
  final Map<String, Integer> expectedWordsAndCounts = result._2();
  // Every file contributes one unique word ("File<i>") on top of the shared ones.
  final int expectedSize = numFiles + numSharedWords;

  try {
    final long avgSize = fileNames.foldLeft((a, file) -> a.longValue() + new File(file).length(), 0L) / fileNames.length();
    System.out.println("Processing " + numFiles + " files with ~"+numSharedWords+" words and an avg size of " + avgSize + " bytes.");

    // warmup
    final List<String> warmupFiles = fileNames.take(1);
    getWordsAndCountsFromFilesWithIteratee(warmupFiles, fileNameToWordsAndCountsWithCharIteratee);
    getWordsAndCountsFromFilesWithIteratee(warmupFiles, fileNameToWordsAndCountsWithCharChunkIteratee);
    getWordsAndCountsFromFilesWithIteratee(warmupFiles, fileNameToWordsAndCountsWithCharChunk2Iteratee);
    getWordsAndCountsFromFilesWithIteratee(warmupFiles, fileNameToWordsAndCountsWithCharChunk2Iteratee);
    getWordsAndCountsFromFilesInParallel(warmupFiles, fileNameToWordsAndCountsWithCharIteratee, 8);
    getWordsAndCountsFromFilesInParallel(warmupFiles, fileNameToWordsAndCountsWithCharChunkIteratee, 8);
    System.gc();

    // get word counts sequentially / single threaded \w iteratee
    long start = System.nanoTime();
    Map<String, Integer> wordsAndCountsFromFiles =
        getWordsAndCountsFromFilesWithIteratee(fileNames, fileNameToWordsAndCountsWithCharIteratee);
    reportAndVerify("in 1 thread using char iteratee", start, wordsAndCountsFromFiles, expectedSize, expectedWordsAndCounts);
    System.gc();

    start = System.nanoTime();
    wordsAndCountsFromFiles =
        getWordsAndCountsFromFilesWithIteratee(fileNames, fileNameToWordsAndCountsWithCharChunkIteratee);
    reportAndVerify("in 1 thread using char chunk iteratee", start, wordsAndCountsFromFiles, expectedSize, expectedWordsAndCounts);
    System.gc();

    start = System.nanoTime();
    wordsAndCountsFromFiles =
        getWordsAndCountsFromFilesWithIteratee(fileNames, fileNameToWordsAndCountsWithCharChunk2Iteratee);
    reportAndVerify("in 1 thread using char chunk2 iteratee", start, wordsAndCountsFromFiles, expectedSize, expectedWordsAndCounts);
    System.gc();

    // get word counts in parallel
    start = System.nanoTime();
    wordsAndCountsFromFiles =
        getWordsAndCountsFromFilesInParallel(fileNames, fileNameToWordsAndCountsWithCharIteratee, 32);
    reportAndVerify("in 32 threads with char iteratee", start, wordsAndCountsFromFiles, expectedSize, expectedWordsAndCounts);
    System.gc();

    start = System.nanoTime();
    wordsAndCountsFromFiles =
        getWordsAndCountsFromFilesInParallel(fileNames, fileNameToWordsAndCountsWithCharChunkIteratee, 32);
    reportAndVerify("in 32 threads with char chunk iteratee", start, wordsAndCountsFromFiles, expectedSize, expectedWordsAndCounts);
  } finally {
    // we have tmpfiles, but still want to be sure not to leave rubbish
    fileNames.foreachDoEffect(a -> new File(a).delete());
  }
}

/**
 * Prints how long one measured run took and checks its result.
 *
 * @param description the middle part of the report line, e.g. "in 1 thread using char iteratee"
 * @param startNanos the {@link System#nanoTime()} reading taken right before the run started
 * @param actual the word counts produced by the run
 * @param expectedSize the number of distinct words the result must contain
 * @param expected the word counts the result must be equal to
 */
private static void reportAndVerify(String description, long startNanos, Map<String, Integer> actual,
    int expectedSize, Map<String, Integer> expected) {
  // Take the reading first so that printing and verification are not part of the measurement.
  final long elapsedMs = (System.nanoTime() - startNanos) / 1000000L;
  System.out.println("Getting word counts " + description + " took " + elapsedMs + " ms.");
  assertTrue(actual != null);
  assertEquals(actual.size(), expectedSize);
  assertEquals(actual, expected);
}
/**
 * Debugging aid: writes one "Have word: count" line per map entry to standard out.
 *
 * @param wordsAndCountsFromFiles the word counts to print
 */
@SuppressWarnings("unused")
private static void print(Map<String, Integer> wordsAndCountsFromFiles) {
  wordsAndCountsFromFiles.forEach(
      (word, count) -> System.out.println("Have " + word + ": " + count));
}
/**
 * Creates {@code numFiles} temp files with sample content and computes the word counts a
 * correct count over all of them must produce.
 * <p>
 * File {@code i} starts with the unique word {@code "File" + i}, followed by the words
 * {@code "someword0"} to {@code "someword" + (numSharedWords - 1)}, one per line. Every shared
 * word therefore occurs once per file, i.e. {@code numFiles} times in total.
 * <p>
 * Each writer is closed via try-with-resources, so no file handle leaks when writing fails.
 * The files are also registered with {@link File#deleteOnExit()} as a safety net, in case the
 * caller never gets the chance to delete them.
 *
 * @param numFiles the number of files to create
 * @param numSharedWords the number of shared words written to every file
 * @return the absolute paths of the created files and the expected word counts
 * @throws IOException if a file cannot be created or written
 */
private static P2<List<String>, Map<String, Integer>> writeSampleFiles(
    int numFiles, int numSharedWords) throws IOException {
  final Map<String, Integer> expectedWordsAndCounts = new HashMap<>();
  List<String> fileNames = nil();
  for (int i = 0; i < numFiles; i++) {
    final File file = File.createTempFile("wordcount-"+ i + "-", ".txt");
    file.deleteOnExit();
    final String fileWord = "File" + i;
    try (BufferedWriter writer = new BufferedWriter(new FileWriter(file))) {
      writer.write(fileWord + "\n");
      expectedWordsAndCounts.put(fileWord, 1);
      for (int j = 0; j < numSharedWords; j++) {
        final String word = "someword" + j;
        writer.write("\n" + word);
        expectedWordsAndCounts.put(word, numFiles);
      }
    }
    fileNames = fileNames.cons(file.getAbsolutePath());
  }
  return P.p(fileNames, expectedWordsAndCounts);
}
/**
 * Counts the words of all given files in the calling thread.
 * Every file name is mapped to its word counts with the given function and the per-file
 * results are then folded into one map with {@code plus}, starting from an empty map.
 *
 * @param fileNames the files to process
 * @param fileNameToWordsAndCountsWithIteratee computes the word counts of a single file
 * @return the word counts summed over all files
 */
public static Map<String, Integer> getWordsAndCountsFromFilesWithIteratee(final List<String> fileNames,
    final F<String, Map<String, Integer>> fileNameToWordsAndCountsWithIteratee) {
  final Map<String, Integer> seed = new HashMap<String, Integer>();
  return fileNames
      .map(fileNameToWordsAndCountsWithIteratee)
      .foldLeft(WordCount::plus, seed);
}
/**
 * Counts the words of all given files using a fixed thread pool of {@code numThreads} threads.
 * <p>
 * The pool is created for this call only and is shut down in a {@code finally} block, so its
 * threads do not outlive the call when counting fails.
 *
 * @param fileNames the files to process
 * @param fileNameToWordsAndCounts computes the word counts of a single file
 * @param numThreads the size of the thread pool
 * @return the word counts summed over all files
 */
public static Map<String, Integer> getWordsAndCountsFromFilesInParallel(
    final List<String> fileNames, final F<String, Map<String, Integer>> fileNameToWordsAndCounts, int numThreads) {
  final ExecutorService pool = newFixedThreadPool(numThreads);
  try {
    final ParModule m = parModule(Strategy.executorStrategy(pool));
    // claim() blocks until the promised result is available.
    return getWordsAndCountsFromFiles(fileNames, fileNameToWordsAndCounts, m).claim();
  } finally {
    pool.shutdown();
  }
}
/**
 * Read documents and extract words and word counts of documents.
 * Each file name is mapped to its word counts and the per-file maps are combined with a monoid
 * whose sum is {@code plus} and whose zero is the empty map; the work is handed to
 * {@code m.parFoldMap}.
 *
 * @param fileNames the files to process
 * @param fileNameToWordsAndCounts computes the word counts of a single file
 * @param m the parallel module that runs the fold
 * @return a promise of the word counts summed over all files
 */
public static Promise<Map<String, Integer>> getWordsAndCountsFromFiles(
    final List<String> fileNames, final F<String, Map<String, Integer>> fileNameToWordsAndCounts, final ParModule m) {
  final Map<String, Integer> zero = Collections.emptyMap();
  final Monoid<Map<String, Integer>> mergingMonoid = monoid(WordCount::plus, zero);
  return m.parFoldMap(fileNames, fileNameToWordsAndCounts, mergingMonoid);
}
/**
 * Adds up two word count maps without modifying either of them.
 *
 * @param a the first counts
 * @param b the second counts
 * @return a new map holding every word of {@code a} and {@code b}; a word present in both
 *         maps gets the sum of its two counts
 */
private static Map<String, Integer> plus(Map<String, Integer> a, Map<String, Integer> b) {
  final Map<String, Integer> merged = new HashMap<>(a);
  b.forEach((word, count) -> merged.merge(word, count, Integer::sum));
  return merged;
}
/**
 * Reads the whole file into a string.
 * <p>
 * The reader is managed by try-with-resources: it is closed on every path, and a failure to
 * open the file surfaces as the original {@link IOException} rather than as a
 * {@link NullPointerException} from closing a reader that was never created.
 *
 * @param file the file to read
 * @return the content of the file
 * @throws IOException if the file cannot be opened or read
 */
@SuppressWarnings("unused")
private static String readFileToString(File file) throws IOException {
  try (Reader reader = new FileReader(file)) {
    // The file length in bytes is only the initial capacity of the buffer; it grows as needed.
    final Writer sw = new StringWriter((int)file.length());
    copy(reader, sw);
    return sw.toString();
  }
}
/**
 * Copies everything that can be read from {@code reader} to {@code writer}, 4096 chars at a
 * time. Neither the reader nor the writer is closed.
 *
 * @param reader the source
 * @param writer the destination
 * @throws IOException if reading or writing fails
 */
private static void copy(Reader reader, Writer writer) throws IOException {
  final char[] chunk = new char[4096];
  // read() reports -1 once the end of the stream has been reached.
  for (int count = reader.read(chunk); count != -1; count = reader.read(chunk)) {
    writer.write(chunk, 0, count);
  }
}
/**
 * Checks that the given condition holds.
 *
 * @param condition the condition to check
 * @throws AssertionError if {@code condition} is false
 */
static void assertTrue(boolean condition) {
  if (condition) {
    return;
  }
  throw new AssertionError();
}
/**
 * Checks that {@code actual} equals {@code expected}.
 * <p>
 * The comparison is null-safe: two {@code null}s are considered equal, and a {@code null} on
 * only one side is reported as a mismatch instead of causing a {@link NullPointerException}.
 *
 * @param actual the value that was computed
 * @param expected the value that was expected
 * @throws IllegalArgumentException if the two values are not equal
 */
static void assertEquals(Object actual, Object expected) {
  final boolean same = expected == null ? actual == null : expected.equals(actual);
  if (!same) {
    throw new IllegalArgumentException("Not equals. Expected: " + expected + ", actual: " + actual);
  }
}
}