/*
* Eoulsan development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public License version 2.1 or
* later and CeCILL-C. This should be distributed with the code.
* If you do not have a copy, see:
*
* http://www.gnu.org/licenses/lgpl-2.1.txt
* http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.txt
*
* Copyright for this code is held jointly by the Genomic platform
* of the Institut de Biologie de l'École normale supérieure and
* the individual authors. These should be listed in @author doc
* comments.
*
* For more information on the Eoulsan project and its aims,
* or to join the Eoulsan Google group, visit the home page
* at:
*
* http://outils.genomique.biologie.ens.fr/eoulsan
*
*/
package fr.ens.biologie.genomique.eoulsan.util;
import static fr.ens.biologie.genomique.eoulsan.EoulsanLogger.getLogger;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.util.AbstractList;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
/**
* This class implements a pseudo map-reduce framework.
* @since 1.0
* @author Laurent Jourdren
*/
public abstract class PseudoMapReduce {
/*
 * Charset named by the "file.encoding" system property. It is used to decode
 * the mapper input stream and to encode the reducer output stream.
 */
private static final Charset CHARSET =
Charset.forName(System.getProperty("file.encoding"));
/* Directory used for the temporary files; may be null (see sort()). */
private File tmpDir;
/* Temporary files created by getMapOutputTempFile(); they are the inputs of sort(). */
private final List<File> listMapOutputFile = new ArrayList<>();
/* Temporary file that receives the output of the sort step; assigned by sort(). */
private File sortOutputFile;
/* Reporter handed to map() and reduce(); cleared at the start of doMap(InputStream). */
private final LocalReporter reporter = new LocalReporter();
/**
 * This class avoids storing repeated entries of a list in memory: each
 * distinct element is stored once together with its number of occurrences.
 * <p>
 * Only {@link #add(Object)}, {@link #size()}, {@link #iterator()} and
 * {@link #clear()} are supported; random access through {@link #get(int)} is
 * not. Iteration returns equal elements consecutively, in the order in which
 * each distinct element was first added (adding "a", "b", "a" iterates as
 * "a", "a", "b").
 * @author Laurent Jourdren
 */
private static final class RepeatedEntriesList<E> extends AbstractList<E> {

  /** Total number of elements added, repetitions included. */
  private int count;

  /** Number of occurrences of each distinct element, in first-insertion order. */
  private final Map<E, Integer> map = new LinkedHashMap<>();

  /**
   * Random access is not supported by this list.
   * @throws UnsupportedOperationException always
   */
  @Override
  public E get(final int index) {
    throw new UnsupportedOperationException();
  }

  /**
   * Get the number of elements in the list, repetitions included.
   * @return the number of elements added since the last clear
   */
  @Override
  public int size() {
    return this.count;
  }

  /**
   * Add an element to the list.
   * @param e the element to add
   * @return always true
   */
  @Override
  public boolean add(final E e) {

    final Integer occurrences = this.map.get(e);
    this.map.put(e, occurrences == null ? 1 : occurrences + 1);

    // Keep size() (and therefore isEmpty()) in sync with the content
    this.count++;

    return true;
  }

  /**
   * Create an iterator that returns each distinct element as many times as it
   * has been added. The iterator does not support removal.
   * @return an iterator over the elements of the list
   */
  @Override
  public Iterator<E> iterator() {

    return new Iterator<E>() {

      private final Iterator<Map.Entry<E, Integer>> it =
          RepeatedEntriesList.this.map.entrySet().iterator();

      private E currentValue;
      // Number of times currentValue must still be returned
      private int currentCount;

      @Override
      public boolean hasNext() {
        return this.currentCount > 0 || this.it.hasNext();
      }

      @Override
      public E next() {

        if (this.currentCount == 0) {

          // Honor the Iterator contract instead of returning null
          if (!this.it.hasNext()) {
            throw new NoSuchElementException();
          }

          final Map.Entry<E, Integer> entry = this.it.next();
          this.currentValue = entry.getKey();
          this.currentCount = entry.getValue();
        }

        this.currentCount--;

        return this.currentValue;
      }

      @Override
      public void remove() {
        throw new UnsupportedOperationException();
      }
    };
  }

  /**
   * Remove all the elements of the list.
   */
  @Override
  public void clear() {
    this.map.clear();
    this.count = 0;
  }
}
//
// Abstract methods
//
/**
 * Mapper. This method is called by doMap() once for each line read from the
 * mapper input. Every string appended to <code>output</code> is written as
 * one line of the map output, and the list is emptied after each call. The
 * reduce phase splits each line at its first tabulation into a key and a
 * value and ignores the lines that contain no tabulation.
 * @param value input of the mapper (one line of the input, without its line
 *          terminator)
 * @param output List of output of the mapper
 * @param reporter reporter
 * @throws IOException if an error occurs while executing the mapper
 */
public abstract void map(final String value, final List<String> output,
final Reporter reporter) throws IOException;
/**
 * Reducer. This method is called by doReduce() for each run of consecutive
 * lines of the sorted map output that share the same key (the text before
 * the first tabulation of the line). The strings appended to
 * <code>output</code> are written to the reducer output.
 * @param key input key of the reducer
 * @param values values for the key; the iterator does not support remove()
 * @param output list of output values of the reducer
 * @param reporter reporter
 * @throws IOException if an error occurs while executing the reducer
 */
public abstract void reduce(final String key, Iterator<String> values,
final List<String> output, final Reporter reporter) throws IOException;
//
// Getter
//
/**
 * Get the reporter object of the Pseudo map reduce. It is the same instance
 * that is passed to the map() and reduce() methods.
 * @return the reporter object.
 */
public Reporter getReporter() {
return this.reporter;
}
//
// Mapper management
//
/**
 * Run the map phase on the content of a file. The file is opened and the
 * resulting stream is handed over to {@link #doMap(InputStream)}.
 * @param inputFile input file for the mapper
 * @throws NullPointerException if the input file is null
 * @throws IOException if an error occurs while opening the file or while
 *           executing the map phase
 */
public void doMap(final File inputFile) throws IOException {

  if (null == inputFile) {
    throw new NullPointerException("The input file is null.");
  }

  final InputStream in = FileUtils.createInputStream(inputFile);
  doMap(in);
}
/**
 * Create a new temporary file for the output of a map phase and register it
 * in the list of the files to sort. The file is created in the map-reduce
 * temporary directory, or in the default temporary directory of the JVM if
 * no directory has been set.
 * @return the newly created temporary file
 * @throws IOException if the temporary file cannot be created
 */
protected File getMapOutputTempFile() throws IOException {

  final File mapOutput = File.createTempFile("map-", ".txt", this.tmpDir);
  this.listMapOutputFile.add(mapOutput);

  return mapOutput;
}
/**
 * Execute the map phase with an InputStream as input. The reporter is
 * cleared, then each line of the stream is passed to
 * {@link #map(String, List, Reporter)} and every output of the mapper is
 * written as one line of a new temporary map output file. The input stream
 * and the map output file are always closed when this method returns, even
 * if the mapper fails.
 * @param is input stream for the mapper
 * @throws NullPointerException if the input stream is null
 * @throws IOException if an error occurs while reading the input, executing
 *           the mapper or writing the map output
 */
public void doMap(final InputStream is) throws IOException {

  if (is == null) {
    throw new NullPointerException("The input stream is null.");
  }

  this.reporter.clear();

  // The reader is closed even if the writer cannot be created or if the
  // mapper throws an exception
  try (BufferedReader br =
      new BufferedReader(new InputStreamReader(is, CHARSET))) {

    final UnSynchronizedBufferedWriter bw =
        FileUtils.createFastBufferedWriter(getMapOutputTempFile());

    try {
      final List<String> results = new ArrayList<>();
      final StringBuilder sb = new StringBuilder();
      String line;

      while ((line = br.readLine()) != null) {

        map(line, results, this.reporter);

        for (String r : results) {
          sb.setLength(0);
          sb.append(r);
          sb.append('\n');
          bw.write(sb.toString());
        }

        // The same list is reused for the next input line
        results.clear();
      }
    } finally {
      // Explicit close: nothing visible here guarantees that the writer
      // type can be used in a try-with-resources statement
      bw.close();
    }
  }
}
//
// Sort management
//
/**
 * Set the temporary directory. This directory is used for the temporary map
 * output files and sort output file, and is passed to the sort command with
 * its "-T" option when it is not null.
 * @param directory the temporary directory; may be null
 */
public void setMapReduceTemporaryDirectory(final File directory) {
this.tmpDir = directory;
}
/**
 * Get the temporary directory.
 * @return the temporary directory, or null if none has been set
 */
public File getMapReduceTemporaryDirectory() {
return this.tmpDir;
}
/**
 * Sort all the map output files into a new temporary file (sortOutputFile)
 * using the external "sort" command. Whatever the outcome of the command,
 * the map output files are deleted and forgotten, so that a later map phase
 * starts from an empty list of files to sort.
 * @return true if the sort command exits with the status 0, or if there is
 *         no map output file to sort (the sort output file is then empty)
 * @throws IOException if the temporary file cannot be created, if the sort
 *           command cannot be started or if the current thread is
 *           interrupted while waiting for the command
 */
private boolean sort() throws IOException {

  this.sortOutputFile = File.createTempFile("sort-", ".txt", this.tmpDir);

  // Without input file, "sort" would read its standard input, which is a
  // pipe that is never closed here: the command would never end
  if (this.listMapOutputFile.isEmpty()) {
    return true;
  }

  // Create command line to execute
  final List<String> command = new ArrayList<>();
  command.add("sort");

  // Set the temporary directory if needed
  if (this.tmpDir != null) {
    command.add("-T");
    command.add(this.tmpDir.getAbsolutePath());
  }

  // Set the output file
  command.add("-o");
  command.add(this.sortOutputFile.getAbsolutePath());

  // Set the files to sort
  for (File mapOutputFile : this.listMapOutputFile) {
    command.add(mapOutputFile.getAbsolutePath());
  }

  // NOTE(review): the command collates with the locale of the process
  // environment; with some locales the lines of two different keys may be
  // interleaved. Forcing LC_ALL=C would avoid this but changes the output
  // order - confirm before changing.

  // Execute command
  try {
    return new ProcessBuilder(command).start().waitFor() == 0;
  } catch (InterruptedException e) {
    // Restore the interrupt status for the callers of this thread
    Thread.currentThread().interrupt();
    throw new IOException(e);
  } finally {

    // Remove temporary map output files
    for (File mapOutputFile : this.listMapOutputFile) {
      if (!mapOutputFile.delete()) {
        getLogger().warning("Can not delete map output file: "
            + mapOutputFile.getAbsolutePath());
      }
    }

    // The files no longer exist: do not try to sort them again
    this.listMapOutputFile.clear();
  }
}
//
// Reducer management
//
/**
 * Run the reduce phase and store its result in a file. The file is opened
 * and the resulting stream is handed over to {@link #doReduce(OutputStream)}.
 * @param outputFile output file for the reducer
 * @throws NullPointerException if the output file is null
 * @throws IOException if an error occurs while opening the file or while
 *           executing the reduce phase
 */
public void doReduce(final File outputFile) throws IOException {

  if (null == outputFile) {
    throw new NullPointerException("The output file is null.");
  }

  final OutputStream out = FileUtils.createOutputStream(outputFile);
  doReduce(out);
}
/**
 * Execute the reduce phase with an OutputStream as output. The map output
 * files are first sorted, then the sorted lines are split at their first
 * tabulation into a key and a value (lines without tabulation are ignored)
 * and {@link #reduce(String, Iterator, List, Reporter)} is called once for
 * each run of consecutive lines that share the same key. Every output of the
 * reducer is written as one line of the output stream.
 * <p>
 * The output stream is always closed and the temporary sort output file is
 * always removed when this method returns, even in case of error.
 * @param os output stream for the reducer
 * @throws NullPointerException if the output stream is null
 * @throws IOException if the data cannot be sorted or if an error occurs
 *           while reading the sorted data, executing the reducer or writing
 *           the output
 */
public void doReduce(final OutputStream os) throws IOException {

  if (os == null) {
    throw new NullPointerException("The output stream is null.");
  }

  // Creating the writer first does not write anything, but ensures that the
  // output stream is closed even if the sort step fails
  try (BufferedWriter bw =
      new BufferedWriter(new OutputStreamWriter(os, CHARSET))) {

    if (!sort()) {
      throw new IOException("Unable to sort/shuffle data.");
    }

    try (BufferedReader br =
        FileUtils.createBufferedReader(this.sortOutputFile)) {

      final List<String> values = new RepeatedEntriesList<>();
      final List<String> results = new ArrayList<>();
      String currentKey = null;
      String line;

      while ((line = br.readLine()) != null) {

        final int indexFirstTab = line.indexOf('\t');

        // Do not process lines without key/value separator (this includes
        // the empty lines)
        if (indexFirstTab == -1) {
          continue;
        }

        final String key = line.substring(0, indexFirstTab);
        final String value = line.substring(indexFirstTab + 1);

        // A new key starts: reduce the values collected for the previous one
        if (currentKey != null && !key.equals(currentKey)) {
          reduce(currentKey, values.iterator(), results, this.reporter);
          writeResults(bw, results);
          values.clear();
        }

        currentKey = key;
        values.add(value);
      }

      // Process the values of the last key
      if (currentKey != null) {
        reduce(currentKey, values.iterator(), results, this.reporter);
        writeResults(bw, results);
      }
    }
  } finally {

    // sortOutputFile is null if sort() failed before creating its file
    if (this.sortOutputFile != null) {
      if (!this.sortOutputFile.delete()) {
        getLogger().warning("Can not delete sort output file: "
            + this.sortOutputFile.getAbsolutePath());
      }
      this.sortOutputFile = null;
    }
  }
}

/**
 * Write each result of the reducer as one line, then empty the list of
 * results so that it can be reused for the next key.
 * @param bw writer of the reducer output
 * @param results results of the last call of the reducer
 * @throws IOException if an error occurs while writing the results
 */
private static void writeResults(final BufferedWriter bw,
    final List<String> results) throws IOException {

  for (String result : results) {
    bw.write(result);
    bw.write('\n');
  }

  results.clear();
}
}