package com.skp.experiment.graph.linkanalysis;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.iterator.FileLineIterable;
import org.apache.mahout.math.CardinalityException;
import org.apache.mahout.math.DistributedRowMatrixWriter;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.SparseMatrix;
import org.apache.mahout.math.hadoop.DistributedRowMatrix;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.io.Closeables;
import com.skp.experiment.cf.als.hadoop.ALSMatrixUtil;
import com.skp.experiment.cf.evaluate.hadoop.EvaluatorUtil;
import com.skp.experiment.cf.math.hadoop.MatrixMultiplyWithThresholdJob;
import com.skp.experiment.cf.math.hadoop.MatrixRowNormalizeJob;
import com.skp.experiment.common.DistributedRowMatrix2TextJob;
import com.skp.experiment.common.OptionParseUtil;
import com.skp.experiment.common.Text2DistributedRowMatrixJob;
import com.skp.experiment.common.join.ImprovedRepartitionJoinAndFilterJob;
import com.skp.experiment.common.mapreduce.TopKVectorMapper;
/*
 * This class implements a random walk algorithm described in http://www2008.org/papers/pdf/p61-fuxmanA.pdf.
 * It assumes that the input graph is bipartite.
 * The bipartite graph consists of:
 *   Left vertices L: users
 *   Right vertices R: items
 *   Edges between L and R: confidence between <u, i>, e.g. the rating given to an item by a user
 */
public class RandomWalkOnBipartiteGraph extends AbstractJob {
// Option-name constants. Only ITERATIONS is referenced by the code visible in this file;
// the remaining names are kept because other code may rely on them.
private static final String ITERATIONS = "iterations";
private static final String TOP_K = "topK";
private static final String NUM_USERS = "numRows";
private static final String NUM_ITEMS = "numCols";

private static final Logger log = LoggerFactory.getLogger(RandomWalkOnBipartiteGraph.class);

// Job parameters. They are assigned in run() from the parsed options and the index sizes file.
// Matrix cardinalities.
private static int numUsers;
private static int numItems;
// Loop bounds of the random walk: the loop runs from startIteration (inclusive) to iterations (exclusive).
private static int startIteration;
private static int iterations;
// Threshold handed to the thresholded matrix multiplication.
private static float gamma;
// Number of entries kept per row by the top-K extraction job.
private static int topK;
/**
 * Command line entry point.
 *
 * <p>Runs this job through {@link ToolRunner} and, when the job reports a non-zero
 * status, terminates the JVM with that status so that calling scripts can detect the failure.
 *
 * @param args command line arguments, forwarded unchanged to {@link #run(String[])}
 * @throws Exception whatever the job itself throws
 */
public static void main(String[] args) throws Exception {
  int exitCode = ToolRunner.run(new RandomWalkOnBipartiteGraph(), args);
  if (exitCode != 0) {
    System.exit(exitCode);
  }
}
/**
 * Drives the complete pipeline:
 * <ol>
 *   <li>parse the options and read the user/item counts from the index sizes file,</li>
 *   <li>convert the input text into two distributed row matrices (one keyed by input column 0,
 *       one keyed by input column 1),</li>
 *   <li>row-normalize both matrices,</li>
 *   <li>alternate the two thresholded multiplications from {@code startIteration} (inclusive)
 *       to {@code iterations} (exclusive),</li>
 *   <li>extract the top-K entries per row of both results and join the first result with the
 *       training input.</li>
 * </ol>
 *
 * @param args command line arguments
 * @return 0 on success, -1 when argument parsing, validation or any sub-job fails
 * @throws Exception propagated from the underlying Hadoop / Mahout jobs
 */
@Override
public int run(String[] args) throws Exception {
  addInputOption();
  addOutputOption();
  // Both options are read unconditionally below, so they are declared as required:
  // a missing value is then reported as a usage error instead of an exception.
  addOption("indexSizes", null, "path to index sizes", true);
  addOption(ITERATIONS, "iteration", "number of iterations.", true);
  addOption("gamma", null, "gamma", String.valueOf(0.00000001));
  addOption("topK", null, "topK", String.valueOf(100));
  addOption("startIteration", null, "start iterations", String.valueOf(0));
  if (parseArguments(args) == null) {
    return -1;
  }

  Map<String, String> indexSizes =
      ALSMatrixUtil.fetchTextFiles(new Path(getOption("indexSizes")), OptionParseUtil.DELIMETER,
          Arrays.asList(0), Arrays.asList(1));
  String userCount = indexSizes.get("0");
  String itemCount = indexSizes.get("1");
  if (userCount == null || itemCount == null) {
    log.error("index sizes file {} must contain entries for key 0 (users) and key 1 (items)",
        getOption("indexSizes"));
    return -1;
  }
  numUsers = Integer.parseInt(userCount);
  numItems = Integer.parseInt(itemCount);
  iterations = Integer.parseInt(getOption(ITERATIONS));
  gamma = Float.parseFloat(getOption("gamma"));
  topK = Integer.parseInt(getOption("topK"));
  startIteration = Integer.parseInt(getOption("startIteration"));

  // Without at least one pass through the loop there is no result matrix to post-process,
  // so fail before any of the preparation jobs is launched.
  if (startIteration >= iterations) {
    log.error("startIteration ({}) must be smaller than iterations ({})", startIteration, iterations);
    return -1;
  }

  // 1. convert csv format into distributed row matrix form
  log.info("convert csv into distributed row matrix");
  int status = ToolRunner.run(getConf(), new Text2DistributedRowMatrixJob(), new String[] {
    "--input", getInputPath().toString(), "--output", pathToInitialEdges().toString(),
    "--rowidx", "0", "--colidx", "1", "--valueidx", "2", "--numCols", Integer.toString(numItems)
  });
  if (status != 0) {
    return -1;
  }
  status = ToolRunner.run(getConf(), new Text2DistributedRowMatrixJob(), new String[] {
    "--input", getInputPath().toString(), "--output", pathToInitialEdgesTranspose().toString(),
    "--rowidx", "1", "--colidx", "0", "--valueidx", "2", "--numCols", Integer.toString(numUsers)
  });
  if (status != 0) {
    return -1;
  }

  // 2. normalize each matrix by row
  log.info("build normalized edges.");
  status = ToolRunner.run(getConf(), new MatrixRowNormalizeJob(), new String[] {
    "--input", pathToInitialEdges().toString(), "--output", pathToRowNormEdges().toString()
  });
  if (status != 0) {
    return -1;
  }
  status = ToolRunner.run(getConf(), new MatrixRowNormalizeJob(), new String[] {
    "--input", pathToInitialEdgesTranspose().toString(), "--output", pathToRowNormEdgesTranspose().toString()
  });
  if (status != 0) {
    return -1;
  }

  // 3. wrap the normalized outputs and derive their transposes
  DistributedRowMatrix uiRowNorm =
      new DistributedRowMatrix(pathToRowNormEdges(), getTempPath("tmp"), numUsers, numItems);
  uiRowNorm.setConf(getConf());
  DistributedRowMatrix iuNorm = uiRowNorm.transpose();
  DistributedRowMatrix iuRowNorm =
      new DistributedRowMatrix(pathToRowNormEdgesTranspose(), getTempPath("tmp"), numItems, numUsers);
  iuRowNorm.setConf(getConf());
  DistributedRowMatrix uiNorm = iuRowNorm.transpose();
  log.info("before for loop");

  // 4. start from the numItems x numItems diagonal class matrix
  log.info("initialize class");
  DistributedRowMatrix initialClass = createDigonalClass(numItems, getTempPath("initial.class"), getConf());
  DistributedRowMatrix ciTranspose = initialClass.transpose();
  DistributedRowMatrix cuTranspose = null;

  // NOTE(review): with startIteration > 0 the loop still starts from the initial class matrix,
  // not from the output of iteration startIteration - 1 -- confirm whether resuming is intended.
  for (int iteration = startIteration; iteration < iterations; iteration++) {
    log.info("current iteration: {}", iteration);
    cuTranspose = timesWithThreshold(ciTranspose, iuNorm, pathToProbsCU(iteration), gamma);
    cuTranspose.setConf(getConf());
    cuTranspose = cuTranspose.transpose();
    // NOTE(review): unlike the product above, this product is not transposed before it is fed
    // into the next iteration -- verify against the multiplication job's semantics.
    ciTranspose = timesWithThreshold(cuTranspose, uiNorm, pathToProbsCI(iteration), gamma);
  }

  // 5. post-processing; cuTranspose is non-null because the loop ran at least once
  if (retrieveTopKPerUser(cuTranspose.getRowPath(), getTempPath("output")) != 0) {
    return -1;
  }
  if (retrieveTopKPerUser(ciTranspose.getRowPath(), new Path(getOutputPath().toString() + "_inv")) != 0) {
    return -1;
  }
  if (appendIsDirectFlag(getInputPath(), getTempPath("output"), getOutputPath()) != 0) {
    return -1;
  }
  return 0;
}
/**
 * Multiplies two distributed row matrices with {@link MatrixMultiplyWithThresholdJob}.
 *
 * <p>Both operands must have the same number of rows. The job is configured on a copy of this
 * job's configuration with a fixed child JVM heap option and task timeout, and is submitted
 * through {@link JobClient#runJob(JobConf)}.
 *
 * @param src left operand
 * @param other right operand
 * @param output path the job writes its result to
 * @param threshold threshold value passed through to the multiplication job
 * @return a matrix handle on {@code output}, declared as
 *         {@code src.numCols()} x {@code other.numCols()}
 * @throws CardinalityException if the row counts of the two operands differ
 */
@SuppressWarnings("deprecation")
public DistributedRowMatrix timesWithThreshold(DistributedRowMatrix src, DistributedRowMatrix other, Path output, float threshold)
    throws IOException, InterruptedException, ClassNotFoundException {
  final int srcRows = src.numRows();
  final int otherRows = other.numRows();
  if (srcRows != otherRows) {
    throw new CardinalityException(srcRows, otherRows);
  }

  // Work on a copy so the job-specific overrides do not leak into the shared configuration.
  Configuration multiplyConf = new Configuration(getConf());
  multiplyConf.set("mapred.child.java.opts", "-Xmx4g");
  multiplyConf.setLong("mapred.task.timeout", 600000 * 10);

  final int otherCols = other.numCols();
  JobConf jobConf = MatrixMultiplyWithThresholdJob.createMatrixMultiplyWithThresholdJob(
      multiplyConf, src.getRowPath(), other.getRowPath(), output, otherCols, threshold);
  JobClient.runJob(jobConf);

  DistributedRowMatrix product =
      new DistributedRowMatrix(output, getTempPath("tmp"), src.numCols(), otherCols);
  product.setConf(jobConf);
  return product;
}
/**
 * Writes a sequence file that maps a running index to the integer found on each input line.
 *
 * <p>Every entry returned by listing {@code verticesPath} is read line by line; each line is
 * parsed as an integer and appended as {@code (index, value)}, with the index counting up
 * from 0 across all files.
 *
 * @param verticesPath location whose listed files contain one integer per line
 * @param indexPath sequence file to create
 * @return the number of records written
 * @throws IOException if reading, writing, or closing the writer fails
 * @throws NumberFormatException if a line is not a valid integer
 */
private int indexVertices(Path verticesPath, Path indexPath) throws IOException {
  FileSystem fs = FileSystem.get(verticesPath.toUri(), getConf());
  int index = 0;
  SequenceFile.Writer writer =
      SequenceFile.createWriter(fs, getConf(), indexPath, IntWritable.class, IntWritable.class);
  boolean threw = true;
  try {
    for (FileStatus fileStatus : fs.listStatus(verticesPath)) {
      InputStream in = HadoopUtil.openStream(fileStatus.getPath(), getConf());
      try {
        for (String line : new FileLineIterable(in)) {
          writer.append(new IntWritable(index++), new IntWritable(Integer.parseInt(line)));
        }
      } finally {
        // A failed close on a read-only stream cannot lose data, so it is safe to ignore.
        Closeables.closeQuietly(in);
      }
    }
    threw = false;
  } finally {
    // A failed close of the writer may mean records were not flushed. Report it unless an
    // earlier exception is already propagating, in which case that exception is kept.
    Closeables.close(writer, threw);
  }
  return index;
}
//TODO: override this for various concepts
/**
 * Placeholder for loading a class matrix from {@code classMatrixPath}.
 *
 * <p>Not implemented: the argument is ignored and {@code null} is always returned.
 * NOTE(review): the method is private, so it cannot be overridden as the TODO suggests
 * unless its visibility is widened.
 *
 * @param classMatrixPath currently unused
 * @return always {@code null}
 */
private DistributedRowMatrix retrieveClassMatrix(Path classMatrixPath) {
return null;
}
/**
 * Builds a {@code numItems} x {@code numItems} matrix with 1.0 on the diagonal, writes it to
 * {@code digonalMatrixPath} and returns a distributed handle on the written data.
 *
 * @param numItems number of rows and columns
 * @param digonalMatrixPath location the matrix is written to
 * @param conf configuration used for writing and attached to the returned matrix
 * @return the distributed diagonal matrix
 * @throws IOException if the matrix cannot be written
 */
private DistributedRowMatrix createDigonalClass(int numItems, Path digonalMatrixPath, Configuration conf) throws IOException {
  Matrix identity = new SparseMatrix(numItems, numItems);
  int diag = 0;
  while (diag < numItems) {
    identity.set(diag, diag, 1.0);
    diag++;
  }
  DistributedRowMatrixWriter.write(digonalMatrixPath, conf, identity);

  DistributedRowMatrix distributed =
      new DistributedRowMatrix(digonalMatrixPath, getTempPath("digonalClass"), numItems, numItems);
  distributed.setConf(conf);
  return distributed;
}
/**
 * Runs a map-only job that feeds the sequence file at {@code input} through
 * {@link TopKVectorMapper} and writes text output to {@code output}. The configured
 * {@code topK} value is passed to the mapper under {@code TopKVectorMapper.TOP_K}.
 *
 * @param input sequence file input path
 * @param output text output path
 * @return 0 if the job completed successfully, -1 otherwise
 */
private int retrieveTopKPerUser(Path input, Path output)
    throws IOException, InterruptedException, ClassNotFoundException {
  Job topKJob = prepareJob(input, output, SequenceFileInputFormat.class, TopKVectorMapper.class,
      NullWritable.class, Text.class, TextOutputFormat.class);
  topKJob.setJarByClass(RandomWalkOnBipartiteGraph.class);
  topKJob.setJobName("Retrieve Top K Per User.");
  topKJob.getConfiguration().setInt(TopKVectorMapper.TOP_K, topK);
  // No reduce phase: the mapper output is the final output.
  topKJob.setNumReduceTasks(0);

  boolean succeeded = topKJob.waitForCompletion(true);
  return succeeded ? 0 : -1;
}
/**
 * Joins the top-K output with the training data using {@link ImprovedRepartitionJoinAndFilterJob}.
 *
 * <p>The exact meaning of the join arguments is defined by that job; presumably this is an
 * outer join on columns 0 and 1 that appends column 2 of the training data, with -1 for rows
 * without a match -- confirm against the join job's documentation.
 *
 * <p>The job runs with this job's configuration, like the other sub-jobs launched by this
 * class, and its exit status is returned so that the caller can detect a failed join.
 *
 * @param train path of the training input
 * @param tmpOutput path of the intermediate top-K output used as the join input
 * @param output final output path
 * @return the exit status of the join job (0 on success)
 * @throws Exception propagated from the join job
 */
private int appendIsDirectFlag(Path train, Path tmpOutput, Path output) throws Exception {
  return ToolRunner.run(getConf(), new ImprovedRepartitionJoinAndFilterJob(), new String[] {
    "-i", tmpOutput.toString(), "-o", output.toString(),
    "-sidx", "0,1", "-tgt", train.toString() + ":0,1:0,1:2:outer",
    "--defaultValue", "-1"
  });
}
/**
 * Temp path of the matrix that run() builds from the input with rows taken from
 * input column 0 and columns from input column 1.
 */
public Path pathToInitialEdges() {
return getTempPath("initial.edges");
}
/**
 * Temp path of the matrix that run() builds from the input with rows taken from
 * input column 1 and columns from input column 0.
 */
public Path pathToInitialEdgesTranspose() {
return getTempPath("initial.edges.transpose");
}
/** Temp path of the row-normalized version of {@link #pathToInitialEdges()}. */
public Path pathToRowNormEdges() {
return getTempPath("row.norm");
}
/** Temp path of the row-normalized version of {@link #pathToInitialEdgesTranspose()}. */
public Path pathToRowNormEdgesTranspose() {
return getTempPath("row.norm.transpose");
}
/**
 * Temp path that receives the first thresholded product of iteration {@code n}.
 *
 * @param n iteration number, appended to the directory name
 */
public Path pathToProbsCU(int n) {
  String dirName = "prob.CU." + n;
  return getTempPath(dirName);
}
/**
 * Temp path that receives the second thresholded product of iteration {@code n}.
 *
 * @param n iteration number, appended to the directory name
 */
public Path pathToProbsCI(int n) {
  String dirName = "prob.CI." + n;
  return getTempPath(dirName);
}
/**
 * Temp path named "matrix.UINorm". Not referenced by the other code visible in this file;
 * presumably intended for a converted dump of the UINorm matrix -- confirm before relying on it.
 */
public Path pathToUINormConv() {
return getTempPath("matrix.UINorm");
}
/**
 * Temp path named "matrix.IUNorm". Not referenced by the other code visible in this file;
 * presumably intended for a converted dump of the IUNorm matrix -- confirm before relying on it.
 */
public Path pathToIUNormConv() {
return getTempPath("matrix.IUNorm");
}
}