/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.math.hadoop;
import com.google.common.io.Closeables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterator;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.function.Functions;
import com.google.common.base.Preconditions;
import java.io.IOException;
import java.net.URI;
import java.util.Iterator;
public final class TimesSquaredJob {
/**
 * Configuration key under which the job stores the URI of the sequence file that holds the
 * input vector. The same string is also used as a directory name beneath the output base path
 * when that file is written.
 */
public static final String INPUT_VECTOR = "DistributedMatrix.times.inputVector";
/**
 * Configuration key for a boolean flag; when true the mapper and reducer accumulate into a
 * {@link RandomAccessSparseVector}, otherwise into a {@link DenseVector}.
 */
public static final String IS_SPARSE_OUTPUT = "DistributedMatrix.times.outputVector.sparse";
/** Configuration key for the integer size used to allocate the output vector. */
public static final String OUTPUT_VECTOR_DIMENSION = "DistributedMatrix.times.output.dimension";
/** Name of the job output directory created beneath the output base path. */
public static final String OUTPUT_VECTOR_FILENAME = "DistributedMatrix.times.outputVector";
/** Not instantiable: this class only exposes static helpers and nested task classes. */
private TimesSquaredJob() { }
/**
 * Convenience overload that starts from a newly constructed {@link Configuration} and
 * delegates to {@link #createTimesSquaredJobConf(Configuration, Vector, Path, Path)}.
 *
 * @param v the input vector
 * @param matrixInputPath location of the matrix rows
 * @param outputVectorPath base path under which the job writes its files
 * @return the configuration produced by the delegate
 * @throws IOException if the delegate fails
 */
public static Configuration createTimesSquaredJobConf(Vector v, Path matrixInputPath, Path outputVectorPath)
  throws IOException {
  Configuration freshConf = new Configuration();
  return createTimesSquaredJobConf(freshConf, v, matrixInputPath, outputVectorPath);
}
/**
 * Builds a job configuration that uses {@link TimesSquaredMapper} as the mapper and
 * {@link VectorSummingReducer} as the reducer, on top of the supplied configuration.
 *
 * @param initialConf configuration the job configuration is derived from
 * @param v the input vector
 * @param matrixInputPath location of the matrix rows
 * @param outputVectorPath base path under which the job writes its files
 * @return the configuration produced by the delegate
 * @throws IOException if the delegate fails
 */
public static Configuration createTimesSquaredJobConf(Configuration initialConf,
                                                      Vector v,
                                                      Path matrixInputPath,
                                                      Path outputVectorPath) throws IOException {
  Class<? extends TimesSquaredMapper> mapClass = TimesSquaredMapper.class;
  Class<? extends VectorSummingReducer> redClass = VectorSummingReducer.class;
  return createTimesSquaredJobConf(initialConf, v, matrixInputPath, outputVectorPath, mapClass, redClass);
}
/**
 * Convenience overload that starts from a newly constructed {@link Configuration} and
 * delegates to {@link #createTimesJobConf(Configuration, Vector, int, Path, Path)}.
 *
 * @param v the input vector
 * @param outDim size of the output vector
 * @param matrixInputPath location of the matrix rows
 * @param outputVectorPath base path under which the job writes its files
 * @return the configuration produced by the delegate
 * @throws IOException if the delegate fails
 */
public static Configuration createTimesJobConf(Vector v,
                                               int outDim,
                                               Path matrixInputPath,
                                               Path outputVectorPath) throws IOException {
  Configuration freshConf = new Configuration();
  return createTimesJobConf(freshConf, v, outDim, matrixInputPath, outputVectorPath);
}
/**
 * Builds a job configuration that uses {@link TimesMapper} as the mapper and
 * {@link VectorSummingReducer} as the reducer, with an explicitly supplied output size.
 *
 * @param initialConf configuration the job configuration is derived from
 * @param v the input vector
 * @param outDim size of the output vector
 * @param matrixInputPath location of the matrix rows
 * @param outputVectorPath base path under which the job writes its files
 * @return the configuration produced by the delegate
 * @throws IOException if the delegate fails
 */
public static Configuration createTimesJobConf(Configuration initialConf,
                                               Vector v,
                                               int outDim,
                                               Path matrixInputPath,
                                               Path outputVectorPath) throws IOException {
  Class<? extends TimesSquaredMapper> mapClass = TimesMapper.class;
  Class<? extends VectorSummingReducer> redClass = VectorSummingReducer.class;
  return createTimesSquaredJobConf(initialConf, v, outDim, matrixInputPath, outputVectorPath, mapClass, redClass);
}
/**
 * Convenience overload that starts from a newly constructed {@link Configuration} and lets the
 * caller choose the mapper and reducer classes.
 *
 * @param v the input vector
 * @param matrixInputPath location of the matrix rows
 * @param outputVectorPathBase base path under which the job writes its files
 * @param mapClass mapper implementation to run
 * @param redClass reducer implementation to run
 * @return the configuration produced by the delegate
 * @throws IOException if the delegate fails
 */
public static Configuration createTimesSquaredJobConf(Vector v,
                                                      Path matrixInputPath,
                                                      Path outputVectorPathBase,
                                                      Class<? extends TimesSquaredMapper> mapClass,
                                                      Class<? extends VectorSummingReducer> redClass)
  throws IOException {
  Configuration freshConf = new Configuration();
  return createTimesSquaredJobConf(freshConf, v, matrixInputPath, outputVectorPathBase, mapClass, redClass);
}
/**
 * Builds a job configuration whose output vector has the same size as the input vector
 * ({@code v.size()}), using the supplied mapper and reducer classes.
 *
 * @param initialConf configuration the job configuration is derived from
 * @param v the input vector; its size is also used as the output size
 * @param matrixInputPath location of the matrix rows
 * @param outputVectorPathBase base path under which the job writes its files
 * @param mapClass mapper implementation to run
 * @param redClass reducer implementation to run
 * @return the configuration produced by the delegate
 * @throws IOException if the delegate fails
 */
public static Configuration createTimesSquaredJobConf(Configuration initialConf,
                                                      Vector v,
                                                      Path matrixInputPath,
                                                      Path outputVectorPathBase,
                                                      Class<? extends TimesSquaredMapper> mapClass,
                                                      Class<? extends VectorSummingReducer> redClass)
  throws IOException {
  int outputVectorDim = v.size();
  return createTimesSquaredJobConf(
      initialConf, v, outputVectorDim, matrixInputPath, outputVectorPathBase, mapClass, redClass);
}
/**
 * Convenience overload that starts from a newly constructed {@link Configuration} and takes an
 * explicit output vector size together with the mapper and reducer classes.
 *
 * @param v the input vector
 * @param outputVectorDim size of the output vector
 * @param matrixInputPath location of the matrix rows
 * @param outputVectorPathBase base path under which the job writes its files
 * @param mapClass mapper implementation to run
 * @param redClass reducer implementation to run
 * @return the configuration produced by the delegate
 * @throws IOException if the delegate fails
 */
public static Configuration createTimesSquaredJobConf(Vector v,
                                                      int outputVectorDim,
                                                      Path matrixInputPath,
                                                      Path outputVectorPathBase,
                                                      Class<? extends TimesSquaredMapper> mapClass,
                                                      Class<? extends VectorSummingReducer> redClass)
  throws IOException {
  Configuration freshConf = new Configuration();
  return createTimesSquaredJobConf(
      freshConf, v, outputVectorDim, matrixInputPath, outputVectorPathBase, mapClass, redClass);
}
/**
 * Builds the complete job configuration that every other factory method in this class
 * delegates to.
 *
 * <p>Besides populating the configuration, this method has a file-system side effect: it writes
 * {@code v} as a single (NullWritable, VectorWritable) record into a new sequence file located at
 * {@code outputVectorPathBase/INPUT_VECTOR/<System.nanoTime()>}, and registers that file as the
 * job's DistributedCache file list so the mapper can read it back in {@code configure}.
 *
 * <p>The job reads {@code matrixInputPath} as a sequence file, writes its result as a sequence
 * file into {@code outputVectorPathBase/OUTPUT_VECTOR_FILENAME}, and uses {@code redClass} both
 * as combiner and as reducer.
 *
 * @param initialConf configuration the returned {@link JobConf} is derived from
 * @param v the input vector; its density decides the {@link #IS_SPARSE_OUTPUT} flag
 * @param outputVectorDim value stored under {@link #OUTPUT_VECTOR_DIMENSION}
 * @param matrixInputPath location of the matrix rows; qualified against the default file system
 * @param outputVectorPathBase base path for the input-vector file and the job output; qualified
 *        against the default file system
 * @param mapClass mapper implementation to run
 * @param redClass reducer (and combiner) implementation to run
 * @return the populated job configuration
 * @throws IOException if the file system cannot be obtained or the input vector cannot be written
 */
public static Configuration createTimesSquaredJobConf(Configuration initialConf,
                                                      Vector v,
                                                      int outputVectorDim,
                                                      Path matrixInputPath,
                                                      Path outputVectorPathBase,
                                                      Class<? extends TimesSquaredMapper> mapClass,
                                                      Class<? extends VectorSummingReducer> redClass)
  throws IOException {
  JobConf conf = new JobConf(initialConf, TimesSquaredJob.class);
  conf.setJobName("TimesSquaredJob: " + matrixInputPath);
  FileSystem fs = FileSystem.get(conf);
  matrixInputPath = fs.makeQualified(matrixInputPath);
  outputVectorPathBase = fs.makeQualified(outputVectorPathBase);

  // The nanoTime suffix gives each invocation its own input-vector file under the base path.
  long now = System.nanoTime();
  Path inputVectorPath = new Path(outputVectorPathBase, INPUT_VECTOR + '/' + now);
  Writable inputVW = new VectorWritable(v);
  SequenceFile.Writer inputVectorPathWriter = new SequenceFile.Writer(fs,
      conf, inputVectorPath, NullWritable.class, VectorWritable.class);
  boolean written = false;
  try {
    inputVectorPathWriter.append(NullWritable.get(), inputVW);
    written = true;
  } finally {
    // Always release the writer. A close failure is only swallowed when append() already
    // failed, so that the original exception is the one that reaches the caller.
    Closeables.close(inputVectorPathWriter, !written);
  }

  URI ivpURI = inputVectorPath.toUri();
  DistributedCache.setCacheFiles(new URI[] {ivpURI}, conf);

  conf.set(INPUT_VECTOR, ivpURI.toString());
  conf.setBoolean(IS_SPARSE_OUTPUT, !v.isDense());
  conf.setInt(OUTPUT_VECTOR_DIMENSION, outputVectorDim);

  FileInputFormat.addInputPath(conf, matrixInputPath);
  conf.setInputFormat(SequenceFileInputFormat.class);
  FileOutputFormat.setOutputPath(conf, new Path(outputVectorPathBase, OUTPUT_VECTOR_FILENAME));

  conf.setMapperClass(mapClass);
  conf.setMapOutputKeyClass(NullWritable.class);
  conf.setMapOutputValueClass(VectorWritable.class);
  conf.setReducerClass(redClass);
  conf.setCombinerClass(redClass);

  conf.setOutputFormat(SequenceFileOutputFormat.class);
  conf.setOutputKeyClass(NullWritable.class);
  conf.setOutputValueClass(VectorWritable.class);
  return conf;
}
/**
 * Reads the job result back: opens {@code part-00000} inside the output path recorded in
 * {@code conf} and returns the vector held by the first value of that sequence file.
 *
 * @param conf configuration of a job whose output path has been set
 * @return the vector stored in the first record of the output file
 * @throws IOException if the output file cannot be opened
 */
public static Vector retrieveTimesSquaredOutputVector(Configuration conf) throws IOException {
  JobConf jobConf = new JobConf(conf);
  Path partFile = new Path(FileOutputFormat.getOutputPath(jobConf), "part-00000");
  SequenceFileValueIterator<VectorWritable> values =
      new SequenceFileValueIterator<VectorWritable>(partFile, true, conf);
  Vector result;
  try {
    VectorWritable first = values.next();
    result = first.get();
  } finally {
    Closeables.closeQuietly(values);
  }
  return result;
}
/**
 * Mapper that folds every matrix row it sees into a single accumulator vector and emits that
 * accumulator once, from {@link #close()}.
 *
 * <p>For each row {@code r} the contribution is {@code (r . v) * r}, where {@code v} is the
 * input vector loaded from the first DistributedCache file in {@link #configure(JobConf)}.
 * Summed over all rows of a matrix {@code A} this is {@code A'(Av)}.
 */
public static class TimesSquaredMapper<T extends WritableComparable> extends MapReduceBase
    implements Mapper<T,VectorWritable, NullWritable,VectorWritable> {

  /** Accumulator emitted from {@link #close()}. */
  private Vector outputVector;
  /** Collector remembered from the most recent {@code map} call; null until then. */
  private OutputCollector<NullWritable,VectorWritable> out;
  /** The vector each row is dotted with. */
  private Vector inputVector;

  /** @return the accumulator vector, for use by subclasses in this package */
  Vector getOutputVector() {
    return outputVector;
  }

  /** Remembers the collector so that {@link #close()} can emit the accumulator. */
  void setOut(OutputCollector<NullWritable,VectorWritable> out) {
    this.out = out;
  }

  /**
   * Loads the input vector and allocates the accumulator according to
   * {@link #OUTPUT_VECTOR_DIMENSION} and {@link #IS_SPARSE_OUTPUT}.
   *
   * @throws IllegalStateException wrapping any IOException raised while reading the vector
   * @throws IllegalArgumentException if the DistributedCache lists no files
   */
  @Override
  public void configure(JobConf conf) {
    try {
      inputVector = loadInputVector(conf);
    } catch (IOException ioe) {
      throw new IllegalStateException(ioe);
    }
    int outDim = conf.getInt(OUTPUT_VECTOR_DIMENSION, Integer.MAX_VALUE);
    if (conf.getBoolean(IS_SPARSE_OUTPUT, false)) {
      outputVector = new RandomAccessSparseVector(outDim, 10);
    } else {
      outputVector = new DenseVector(outDim);
    }
  }

  /** Reads the first value of the sequence file named by the first DistributedCache entry. */
  private static Vector loadInputVector(JobConf conf) throws IOException {
    URI[] cachedFiles = DistributedCache.getCacheFiles(conf);
    Preconditions.checkArgument(cachedFiles != null && cachedFiles.length >= 1,
                                "missing paths from the DistributedCache");
    // Only the path component of the URI is used to locate the file.
    Path vectorFile = new Path(cachedFiles[0].getPath());
    SequenceFileValueIterator<VectorWritable> values =
        new SequenceFileValueIterator<VectorWritable>(vectorFile, true, conf);
    try {
      return values.next().get();
    } finally {
      Closeables.closeQuietly(values);
    }
  }

  /**
   * Adds {@code scale(v) * v} to the accumulator. Nothing is written to {@code out} here; the
   * collector is only remembered for {@link #close()}.
   */
  @Override
  public void map(T rowNum,
                  VectorWritable v,
                  OutputCollector<NullWritable,VectorWritable> out,
                  Reporter rep) throws IOException {
    setOut(out);
    double weight = scale(v);
    if (weight == 0.0) {
      // A zero weight contributes nothing to the sum.
      return;
    }
    if (weight == 1.0) {
      // Unit weight: plain addition, no multiplication needed.
      outputVector.assign(v.get(), Functions.PLUS);
    } else {
      outputVector.assign(v.get(), Functions.plusMult(weight));
    }
  }

  /** @return the dot product of the row with the input vector */
  protected double scale(VectorWritable v) {
    Vector row = v.get();
    return row.dot(inputVector);
  }

  /** Emits the accumulator, provided {@code map} was called at least once. */
  @Override
  public void close() throws IOException {
    if (out == null) {
      return;
    }
    out.collect(NullWritable.get(), new VectorWritable(outputVector));
  }
}
/**
 * Mapper variant keyed by integer row number: instead of accumulating scaled rows, it stores
 * the dot product of each row with the input vector at index {@code rowNum} of the output
 * vector. Rows whose dot product is zero leave the output vector untouched.
 */
public static class TimesMapper extends TimesSquaredMapper<IntWritable> {
  @Override
  public void map(IntWritable rowNum,
                  VectorWritable v,
                  OutputCollector<NullWritable,VectorWritable> out,
                  Reporter rep) {
    setOut(out);
    double product = scale(v);
    if (product == 0.0) {
      return;
    }
    int row = rowNum.get();
    getOutputVector().setQuick(row, product);
  }
}
/**
 * Reducer (also usable as a combiner) that adds all incoming vectors element-wise into one
 * accumulator and emits the accumulator under a {@link NullWritable} key.
 *
 * <p>The accumulator is allocated once in {@link #configure(JobConf)} and is not reset between
 * {@code reduce} calls, so a second call on the same instance would continue from the previous
 * total.
 */
public static class VectorSummingReducer extends MapReduceBase
    implements Reducer<NullWritable,VectorWritable,NullWritable,VectorWritable> {

  /** Running element-wise sum of every vector seen so far. */
  private Vector outputVector;

  /**
   * Allocates the accumulator with the size stored under {@link #OUTPUT_VECTOR_DIMENSION},
   * sparse or dense depending on {@link #IS_SPARSE_OUTPUT}.
   */
  @Override
  public void configure(JobConf conf) {
    int outputDimension = conf.getInt(OUTPUT_VECTOR_DIMENSION, Integer.MAX_VALUE);
    boolean sparse = conf.getBoolean(IS_SPARSE_OUTPUT, false);
    if (sparse) {
      outputVector = new RandomAccessSparseVector(outputDimension, 10);
    } else {
      outputVector = new DenseVector(outputDimension);
    }
  }

  /** Adds every non-null vector to the accumulator, then emits the accumulator once. */
  @Override
  public void reduce(NullWritable n,
                     Iterator<VectorWritable> vectors,
                     OutputCollector<NullWritable,VectorWritable> out,
                     Reporter reporter) throws IOException {
    while (vectors.hasNext()) {
      VectorWritable partial = vectors.next();
      if (partial == null) {
        continue;
      }
      outputVector.assign(partial.get(), Functions.PLUS);
    }
    VectorWritable total = new VectorWritable(outputVector);
    out.collect(NullWritable.get(), total);
  }
}
}