package com.skp.experiment.cf.als.hadoop;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.mahout.cf.taste.common.TopK;
import org.apache.mahout.cf.taste.impl.recommender.GenericRecommendedItem;
import org.apache.mahout.cf.taste.recommender.RecommendedItem;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.FileLineIterator;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator;
import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.MatrixSlice;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.hadoop.DistributedRowMatrix;
import org.apache.mahout.math.map.OpenHashMap;
import org.apache.mahout.math.map.OpenIntObjectHashMap;
import com.google.common.primitives.Floats;
import com.skp.experiment.common.OptionParseUtil;
/**
 * Helper utilities for DistributedRowMatrix, mainly adapted from Mahout.
* @author doyoung
*
*/
public class ALSMatrixUtil {
  /**
   * Read the first row of the matrix as a Vector.
   * @param dir HDFS path where the DistributedRowMatrix ({@code <IntWritable, VectorWritable>} SequenceFile) resides.
   * @param conf job Configuration.
   * @return first row of the DistributedRowMatrix without the IntWritable key, or null if the matrix is empty.
   * @throws IOException
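   * <p>A minimal usage sketch (the path here is hypothetical):
   * <pre>{@code
   * Vector firstRow = ALSMatrixUtil.readFirstRow(new Path("/user/als/U"), conf);
   * int numFeatures = firstRow == null ? 0 : firstRow.size();
   * }</pre>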
*/
public static Vector readFirstRow(Path dir, Configuration conf) throws IOException {
Iterator<VectorWritable> iterator = new SequenceFileDirValueIterator<VectorWritable>(dir,
PathType.LIST,
PathFilters.partFilter(),
null,
true,
conf);
return iterator.hasNext() ? iterator.next().get() : null;
}
/**
   * Read the entire matrix into memory, keyed by row_id.
*/
@SuppressWarnings("rawtypes")
public static OpenIntObjectHashMap<Vector> readMatrixByRows(Path dir, Context ctx) {
return readMatrixByRows(dir, ctx.getConfiguration());
}
  /**
   * Read the entire matrix into memory, keyed by row_id, typically from a Mapper's setup().
   * Caution: the caller usually runs inside a Mapper, so when the source matrix is large the
   * map task tends to run out of heap space. To address this, make sure the caller forces only
   * a proper number of map task instances to run simultaneously on each datanode.
   * (I have tried a file lock on HDFS since it is easy and good enough for my needs, but will try ZooKeeper later.)
   * @param dir HDFS path of the DistributedRowMatrix.
   * @param conf Configuration of this cluster.
   * @return rows of the whole file-based DistributedRowMatrix; the key is row_id.
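   * <p>A minimal call-site sketch from a Mapper's setup() (the matrix path is hypothetical):
   * <pre>{@code
   * private OpenIntObjectHashMap<Vector> M;
   * protected void setup(Context ctx) throws IOException {
   *   M = ALSMatrixUtil.readMatrixByRows(new Path("/user/als/M"), ctx.getConfiguration());
   * }
   * }</pre>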
*/
public static OpenIntObjectHashMap<Vector> readMatrixByRows(Path dir, Configuration conf) {
OpenIntObjectHashMap<Vector> matrix = new OpenIntObjectHashMap<Vector>();
for (Pair<IntWritable,VectorWritable> pair :
new SequenceFileDirIterable<IntWritable,VectorWritable>(dir, PathType.LIST, PathFilters.partFilter(), conf)) {
int rowIndex = pair.getFirst().get();
Vector row = pair.getSecond().get().clone();
matrix.put(rowIndex, row);
}
return matrix;
}
  private static class MatrixLoaderThread extends Thread {
    private final Matrix matrix;
    private final SequenceFileIterator<IntWritable, VectorWritable> iter;
    MatrixLoaderThread(Matrix matrix, SequenceFileIterator<IntWritable, VectorWritable> iter) {
      this.matrix = matrix;
      this.iter = iter;
    }
    @Override
    public void run() {
      // each part file holds a disjoint set of rows, so concurrent loaders write to disjoint cells
      while (iter.hasNext()) {
        Pair<IntWritable, VectorWritable> pair = iter.next();
        int rowId = pair.getFirst().get();
        Iterator<Vector.Element> cols = pair.getSecond().get().iterateNonZero();
        while (cols.hasNext()) {
          Vector.Element col = cols.next();
          matrix.set(rowId, col.index(), col.get());
        }
      }
    }
  }
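  /**
   * Read the entire matrix into a DenseMatrix, spawning one loader thread per part file.
   * Since a MapReduce-produced matrix stores each row in exactly one part file, the threads
   * write to disjoint cells of the shared backing array; note the thread count is bounded
   * only by the number of part files.
   * <p>Usage sketch (the path and dimensions are hypothetical; they must match the stored matrix):
   * <pre>{@code
   * Matrix M = ALSMatrixUtil.readMatrixByRowsMultithreaded(new Path("/user/als/M"), conf, numRows, numCols);
   * }</pre>
   */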
  public static Matrix readMatrixByRowsMultithreaded(Path dir, Configuration conf, int numRows, int numCols)
    throws IOException, InterruptedException {
    Matrix matrix = new DenseMatrix(numRows, numCols);
    List<MatrixLoaderThread> runners = new ArrayList<MatrixLoaderThread>();
    FileStatus[] status = FileSystem.get(conf).globStatus(new Path(dir.toString() + "/part*"));
    for (FileStatus file : status) {
      SequenceFileIterator<IntWritable, VectorWritable> iter =
        new SequenceFileIterator<IntWritable, VectorWritable>(file.getPath(), true, conf);
      MatrixLoaderThread thread = new MatrixLoaderThread(matrix, iter);
      thread.start();
      runners.add(thread);
    }
    for (MatrixLoaderThread thread : runners) {
      thread.join();
    }
    return matrix;
  }
public static Matrix conv2Matrix(OpenHashMap<Integer, Vector> x, int rowNums, int colNums) {
Matrix matrix = new DenseMatrix(rowNums, colNums);
for (Integer rowID : x.keySet()) {
matrix.assignRow(rowID, x.get(rowID));
}
return matrix;
}
/**
   * read entire matrix into memory using row_id as key from a Mapper's setup().
   * use this when the caller knows the dimensions of the source matrix.
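   * <p>A minimal sketch (path and dimensions are assumptions; they must match the stored matrix):
   * <pre>{@code
   * DenseMatrix Y = ALSMatrixUtil.readDenseMatrixByRows(new Path("/user/als/Y"), conf, numItems, numFeatures);
   * }</pre>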
*/
  @SuppressWarnings("rawtypes")
  public static DenseMatrix readDenseMatrixByRows(Path dir, Context ctx, int numRows, int numCols) {
return readDenseMatrixByRows(dir, ctx.getConfiguration(), numRows, numCols);
}
public static DenseMatrix readDenseMatrixByRows(Path dir, Configuration conf, int numRows, int numCols) {
DenseMatrix matrix = new DenseMatrix(numRows, numCols);
for (Pair<IntWritable, VectorWritable> pair:
new SequenceFileDirIterable<IntWritable, VectorWritable>(dir, PathType.LIST, PathFilters.partFilter(), conf)) {
int rowIndex = pair.getFirst().get();
Vector row = pair.getSecond().get().clone();
matrix.assignRow(rowIndex, row);
}
return matrix;
}
  /**
   * Comparator that orders RecommendedItem by ascending preference value; TopK evicts the
   * smallest element under this ordering, so it retains the highest-scored items.
   */
public static final Comparator<RecommendedItem> BY_PREFERENCE_VALUE =
new Comparator<RecommendedItem>() {
@Override
public int compare(RecommendedItem one, RecommendedItem two) {
return Floats.compare(one.getValue(), two.getValue());
}
};
  /**
   * Extract the topK RecommendedItems per row, ordered by BY_PREFERENCE_VALUE.
   * @param dir HDFS path of the DistributedRowMatrix.
   * @param ctx the job's Context.
   * @param topK number of items kept in the priority queue per row.
   * @return the topK-scored items for each row_id.
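   * <p>Usage sketch (the path and topK value are hypothetical):
   * <pre>{@code
   * OpenIntObjectHashMap<TopK<RecommendedItem>> top =
   *     ALSMatrixUtil.readMatrixByRowsTopK(new Path("/user/als/scores"), ctx, 10);
   * }</pre>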
*/
@SuppressWarnings("rawtypes")
public static OpenIntObjectHashMap<TopK<RecommendedItem>> readMatrixByRowsTopK(Path dir, Context ctx, int topK) {
return readMatrixByRowsTopK(dir, ctx.getConfiguration(), topK);
}
public static OpenIntObjectHashMap<TopK<RecommendedItem>> readMatrixByRowsTopK(Path dir, Configuration conf, int topK) {
OpenIntObjectHashMap<TopK<RecommendedItem>> matrix = new OpenIntObjectHashMap<TopK<RecommendedItem>>();
    // the glob skips sidecar files that start with '_' (e.g. _SUCCESS, _logs)
    for (Pair<IntWritable, VectorWritable> pair :
         new SequenceFileDirIterable<IntWritable, VectorWritable>(new Path(dir.toString() + "/[^_]*"), PathType.GLOB, null, conf)) {
TopK<RecommendedItem> topKItems = new TopK<RecommendedItem>(topK, BY_PREFERENCE_VALUE);
int rowIndex = pair.getFirst().get();
Vector rowVector = pair.getSecond().get().clone();
Iterator<Vector.Element> cols = rowVector.iterateNonZero();
while (cols.hasNext()) {
Vector.Element e = cols.next();
topKItems.offer(new GenericRecommendedItem(e.index(), (float)e.get()));
}
matrix.put(rowIndex, topKItems);
}
return matrix;
}
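  /**
   * Read a DistributedRowMatrix into a dense in-memory Matrix. The hard-coded "/tmp/multiply"
   * path is only DistributedRowMatrix's scratch output path; the row iteration done here
   * never writes to it.
   */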
public static Matrix readDistributedRowMatrix(Path dir, Configuration conf, int numRows, int numCols) {
Matrix result = new DenseMatrix(numRows, numCols);
DistributedRowMatrix X = new DistributedRowMatrix(dir, new Path("/tmp/multiply"), numRows, numCols);
X.setConf(conf);
Iterator<MatrixSlice> rows = X.iterator();
while (rows.hasNext()) {
MatrixSlice row = rows.next();
result.assignRow(row.index(), row.vector());
}
return result;
}
public static Matrix readDistributedRowMatrix(Path dir, int numRows, int numCols) {
return readDistributedRowMatrix(dir, new Configuration(), numRows, numCols);
}
/**
   * Fetch a key-value HashMap from delimited text files, using column index lists to build keys and values.
   * @param conf job Configuration.
   * @param input HDFS path of a text file, or of a directory of text files.
   * @param delimiter field delimiter used to split each line.
   * @param keyIdxs column indexes encoded to form the key.
   * @param valueIdxs column indexes encoded to form the value.
   * @return map from encoded key columns to encoded value columns.
   * @throws IOException
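   * <p>A minimal sketch (the path and column indexes are hypothetical):
   * <pre>{@code
   * Map<String, String> idToName =
   *     ALSMatrixUtil.fetchTextFiles(conf, new Path("/user/als/items.tsv"), "\t",
   *         Arrays.asList(0), Arrays.asList(1));
   * }</pre>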
*/
  public static Map<String, String> fetchTextFiles(Configuration conf, Path input, String delimiter,
                                                   List<Integer> keyIdxs, List<Integer> valueIdxs) throws IOException {
    Map<String, String> caches = new HashMap<String, String>();
    // read the target file, or every non-sidecar file under the target directory.
    FileSystem fs = FileSystem.get(conf);
    if (fs.isFile(input)) {
      fetchTextFile(fs.open(input), delimiter, keyIdxs, valueIdxs, caches);
    } else {
      // the glob skips files that start with '_' (e.g. _SUCCESS, _logs)
      FileStatus[] files = fs.globStatus(new Path(input.toString() + "/[^_]*"));
      for (FileStatus file : files) {
        fetchTextFile(fs.open(file.getPath()), delimiter, keyIdxs, valueIdxs, caches);
      }
    }
    return caches;
  }
  public static void fetchTextFile(FSDataInputStream in, String delimiter,
                                   List<Integer> keyIdxs, List<Integer> valueIdxs, Map<String, String> result) throws IOException {
    FileLineIterator iter = new FileLineIterator(in);
    while (iter.hasNext()) {
      String line = iter.next();
      String[] tokens = line.split(delimiter);
      // record the encoded key and value columns
      String key = OptionParseUtil.encode(tokens, keyIdxs, delimiter);
      String value = OptionParseUtil.encode(tokens, valueIdxs, delimiter);
      result.put(key, value);
    }
  }
@SuppressWarnings("rawtypes")
  public static Map<String, String> fetchTextFiles(Context ctx, Path input, String delimiter,
                                                   List<Integer> keyIdxs, List<Integer> valueIdxs) throws IOException {
    return fetchTextFiles(ctx.getConfiguration(), input, delimiter, keyIdxs, valueIdxs);
  }
  public static Map<String, String> fetchTextFiles(Path input, String delimiter,
                                                   List<Integer> keyIdxs, List<Integer> valueIdxs) throws IOException {
    return fetchTextFiles(new Configuration(), input, delimiter, keyIdxs, valueIdxs);
  }
}