/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.math.hadoop.decomposer;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.io.Closeables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.math.MatrixSlice;
import org.apache.mahout.math.OrthonormalityVerifier;
import org.apache.mahout.math.SparseRowMatrix;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorIterable;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.decomposer.EigenStatus;
import org.apache.mahout.math.decomposer.SimpleEigenVerifier;
import org.apache.mahout.math.decomposer.SingularVectorVerifier;
import org.apache.mahout.math.hadoop.DistributedRowMatrix;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
/**
* <p>Class that takes the output of an eigendecomposition (specified as a Path location) and verifies its
* correctness in terms of the following: if you have a vector e, and a matrix m, then let e' = m.timesSquared(e);
* the error w.r.t. eigenvector-ness is derived from the cosine of the angle between e and e':</p>
* <pre>
* cosAngle(e,e') = e.dot(e') / (e.norm(2)*e'.norm(2))
* error(e,e') = |1 - cosAngle(e,e')|
* </pre>
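* <p>A set of eigenvectors should also all be very close to orthogonal, i.e. the matrix of all inner products
* between eigenvectors should be close to the identity matrix. Note that this orthonormality check is not
* currently performed by this job; only the per-vector error above is verified.
* </p>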
* <p>
* Parameters used in the cleanup (other than in the input/output path options) include --minEigenvalue, which
* specifies the value below which eigenvector/eigenvalue pairs will be discarded, and --maxError, which specifies
* the maximum error (as defined above) to be tolerated in an eigenvector.</p>
* <p>
* If all the eigenvectors can fit in memory, --inMemory allows for a speedier completion of this task by doing so.
* </p>
*/
public class EigenVerificationJob extends AbstractJob {

  /** Name of the file, created under the output path, that receives the verified eigenvectors. */
  public static final String CLEAN_EIGENVECTORS = "cleanEigenvectors";

  private static final Logger log = LoggerFactory.getLogger(EigenVerificationJob.class);

  private SingularVectorVerifier eigenVerifier;
  private VectorIterable eigensToVerify;
  private VectorIterable corpus;
  private double maxError;
  private double minEigenValue;
  private Path tmpOut;
  private Path outPath;
  private int maxEigensToKeep;
  private Path cleanedEigensPath;

  /**
   * Supplies the eigenvectors to verify directly. When set, the eigen input path passed to
   * {@link #run(Path, Path, Path, Path, double, double, boolean, Configuration)} or
   * {@link #runJob(Configuration, Path, Path, Path, boolean, double, int)} is not loaded.
   *
   * @param eigens the purported eigenvectors
   */
  public void setEigensToVerify(VectorIterable eigens) {
    eigensToVerify = eigens;
  }

  /**
   * Command-line entry point: parses the options and delegates to
   * {@link #runJob(Configuration, Path, Path, Path, boolean, double, int)}.
   *
   * @param args the command-line arguments
   * @return -1 if argument parsing returned null, 0 otherwise
   */
  @Override
  public int run(String[] args) throws Exception {
    Map<String, String> argMap = handleArgs(args);
    if (argMap == null) {
      return -1;
    }
    if (argMap.isEmpty()) {
      return 0;
    }
    // runJob() has no minimum-eigenvalue parameter (its public signature is kept stable), so the
    // --minEigenvalue option is applied through the field that pruneEigens() reads.
    String minEigenArg = argMap.get("--minEigenvalue");
    if (minEigenArg != null) {
      minEigenValue = Double.parseDouble(minEigenArg);
    }
    // --inMemory is declared with the default "false"; a plain null check would treat that
    // default as "enabled", so the value itself is parsed instead.
    boolean inMemory = Boolean.parseBoolean(argMap.get("--inMemory"));
    runJob(getConf(),
           new Path(argMap.get("--eigenInput")),
           new Path(argMap.get("--corpusInput")),
           getOutputPath(),
           inMemory,
           Double.parseDouble(argMap.get("--maxError")),
           Integer.parseInt(argMap.get("--maxEigens")));
    return 0;
  }

  /**
   * Run the job with the given arguments
   * @param corpusInput the corpus input Path
   * @param eigenInput the eigenvector input Path; ignored if null or if eigenvectors were already supplied
   *                   via {@link #setEigensToVerify(VectorIterable)}
   * @param output the output Path
   * @param tempOut temporary output Path
   * @param maxError a double representing the maximum error
   * @param minEigenValue a double representing the minimum eigenvalue
   * @param inMemory a boolean requesting in-memory preparation
   * @param conf the Configuration to use, or null if a default is ok
   *             (saves referencing Configuration in calling classes unless needed)
   * @return always 0
   * @throws IOException if writing the cleaned eigenvectors fails
   */
  public int run(Path corpusInput,
                 Path eigenInput,
                 Path output,
                 Path tempOut,
                 double maxError,
                 double minEigenValue,
                 boolean inMemory,
                 Configuration conf) throws IOException {
    this.outPath = output;
    this.tmpOut = tempOut;
    this.maxError = maxError;
    this.minEigenValue = minEigenValue;
    // Honour the documented contract: a null conf means "use a default Configuration".
    Configuration effectiveConf = conf == null ? new Configuration() : conf;
    if (eigenInput != null && eigensToVerify == null) {
      prepareEigens(effectiveConf, eigenInput, inMemory);
    }
    // The same configuration is used for reading and for writing, so a caller-supplied
    // configuration is no longer silently replaced by a default one when saving.
    verifyPruneAndSave(corpusInput, effectiveConf, effectiveConf);
    return 0;
  }

  /**
   * Registers the command-line options of this job and parses the given arguments.
   *
   * @param args the command-line arguments
   * @return whatever {@code parseArguments} returns for the given arguments
   */
  private Map<String, String> handleArgs(String[] args) throws IOException {
    addOutputOption();
    addOption("eigenInput",
              "ei",
              "The Path for purported eigenVector input files (SequenceFile<WritableComparable,VectorWritable>.",
              null);
    addOption("corpusInput", "ci", "The Path for corpus input files (SequenceFile<WritableComparable,VectorWritable>.");
    // NOTE(review): the next two registrations look redundant with addOutputOption() and with the
    // help option the base class presumably provides; left in place until confirmed against AbstractJob.
    addOption(DefaultOptionCreator.outputOption().create());
    addOption(DefaultOptionCreator.helpOption());
    addOption("inMemory", "mem", "Buffer eigen matrix into memory (if you have enough!)", "false");
    addOption("maxError", "err", "Maximum acceptable error", "0.05");
    addOption("minEigenvalue", "mev", "Minimum eigenvalue to keep the vector for", "0.0");
    addOption("maxEigens", "max", "Maximum number of eigenvectors to keep (0 means all)", "0");
    return parseArguments(args);
  }

  /**
   * Computes the pairwise inner products of the eigenvectors under verification.
   * Currently not called by either run method.
   */
  private VectorIterable computePairwiseInnerProducts() {
    return OrthonormalityVerifier.pairwiseInnerProducts(eigensToVerify);
  }

  /**
   * Shared tail of both run methods: wraps the corpus, verifies every eigenvector against it,
   * prunes the results and writes the survivors.
   *
   * @param corpusInput the corpus input Path
   * @param corpusConf the Configuration handed to the corpus matrix
   * @param outputConf the Configuration used to write the cleaned eigenvectors
   * @throws IllegalStateException if no eigenvectors were loaded or supplied
   * @throws IOException if writing the cleaned eigenvectors fails
   */
  private void verifyPruneAndSave(Path corpusInput, Configuration corpusConf, Configuration outputConf)
    throws IOException {
    if (eigensToVerify == null) {
      throw new IllegalStateException(
          "No eigenvectors to verify: pass an eigen input path or call setEigensToVerify() first");
    }
    DistributedRowMatrix c = new DistributedRowMatrix(corpusInput, tmpOut, 1, 1);
    c.setConf(corpusConf);
    corpus = c;
    // set up eigenverifier and orthoverifier TODO: allow multithreaded execution
    eigenVerifier = new SimpleEigenVerifier();
    // we don't currently verify orthonormality here.
    Map<MatrixSlice, EigenStatus> eigenMetaData = verifyEigens();
    List<Map.Entry<MatrixSlice, EigenStatus>> prunedEigenMeta = pruneEigens(eigenMetaData);
    saveCleanEigens(outputConf, prunedEigenMeta);
  }

  /**
   * Writes the given eigenvectors, in iteration order, to {@link #CLEAN_EIGENVECTORS} under the output
   * path as (IntWritable index, VectorWritable) pairs, stopping once {@code maxEigensToKeep} have been
   * written, and records the written path for {@link #getCleanedEigensPath()}.
   *
   * @param conf the Configuration used to resolve the file system and create the writer
   * @param prunedEigenMeta the eigenvectors (with their verification status) to write
   * @throws IOException if the file cannot be created, written, or closed
   */
  private void saveCleanEigens(Configuration conf, Collection<Map.Entry<MatrixSlice, EigenStatus>> prunedEigenMeta)
    throws IOException {
    Path path = new Path(outPath, CLEAN_EIGENVECTORS);
    // Resolve the file system from the path itself so a fully-qualified output path on a
    // non-default file system works; unqualified paths still resolve to the default one.
    FileSystem fs = FileSystem.get(path.toUri(), conf);
    SequenceFile.Writer seqWriter = new SequenceFile.Writer(fs, conf, path, IntWritable.class, VectorWritable.class);
    boolean success = false;
    try {
      IntWritable iw = new IntWritable();
      int numEigensWritten = 0;
      for (Map.Entry<MatrixSlice, EigenStatus> pruneSlice : prunedEigenMeta) {
        MatrixSlice s = pruneSlice.getKey();
        EigenStatus meta = pruneSlice.getValue();
        EigenVector ev = new EigenVector(s.vector(),
                                         meta.getEigenValue(),
                                         Math.abs(1 - meta.getCosAngle()),
                                         s.index());
        log.info("appending {} to {}", ev, path);
        Writable vw = new VectorWritable(ev);
        iw.set(s.index());
        seqWriter.append(iw, vw);
        // Increment the number of eigenvectors written and stop once the specified limit is reached.
        // A limit of 0 means "write all": the counter is already > 0 here, so it never equals 0.
        numEigensWritten++;
        if (numEigensWritten == maxEigensToKeep) {
          log.info("{} of the {} total eigens have been written", maxEigensToKeep, prunedEigenMeta.size());
          break;
        }
      }
      success = true;
    } finally {
      // Surface a failed close after a successful write (the file may be incomplete), but do not
      // let a close failure mask an exception that is already propagating.
      Closeables.close(seqWriter, !success);
    }
    cleanedEigensPath = path;
  }

  /**
   * Keeps only the entries whose error {@code |1 - cosAngle|} is below {@code maxError} and whose
   * eigenvalue is above {@code minEigenValue}, sorted by ascending slice index.
   *
   * @param eigenMetaData verification status per eigenvector
   * @return the surviving entries, ordered by index
   */
  private List<Map.Entry<MatrixSlice, EigenStatus>> pruneEigens(Map<MatrixSlice, EigenStatus> eigenMetaData) {
    List<Map.Entry<MatrixSlice, EigenStatus>> prunedEigenMeta = Lists.newArrayList();
    for (Map.Entry<MatrixSlice, EigenStatus> entry : eigenMetaData.entrySet()) {
      EigenStatus status = entry.getValue();
      boolean accurateEnough = Math.abs(1 - status.getCosAngle()) < maxError;
      boolean largeEnough = status.getEigenValue() > minEigenValue;
      if (accurateEnough && largeEnough) {
        prunedEigenMeta.add(entry);
      }
    }
    Collections.sort(prunedEigenMeta, new Comparator<Map.Entry<MatrixSlice, EigenStatus>>() {
      @Override
      public int compare(Map.Entry<MatrixSlice, EigenStatus> e1, Map.Entry<MatrixSlice, EigenStatus> e2) {
        int index1 = e1.getKey().index();
        int index2 = e2.getKey().index();
        if (index1 < index2) {
          return -1;
        }
        if (index1 > index2) {
          return 1;
        }
        return 0;
      }
    });
    return prunedEigenMeta;
  }

  /**
   * Runs the eigen verifier on every slice of {@code eigensToVerify} against the corpus.
   *
   * @return the verification status of each slice
   */
  private Map<MatrixSlice, EigenStatus> verifyEigens() {
    Map<MatrixSlice, EigenStatus> eigenMetaData = Maps.newHashMap();
    for (MatrixSlice slice : eigensToVerify) {
      EigenStatus status = eigenVerifier.verify(corpus, slice.vector());
      eigenMetaData.put(slice, status);
    }
    return eigenMetaData;
  }

  /**
   * Loads the eigenvectors to verify from the given path, optionally buffering them in memory.
   *
   * @param conf the Configuration handed to the distributed matrix
   * @param eigenInput the eigenvector input Path
   * @param inMemory whether to copy all eigenvectors into an in-memory matrix
   */
  private void prepareEigens(Configuration conf, Path eigenInput, boolean inMemory) {
    DistributedRowMatrix eigens = new DistributedRowMatrix(eigenInput, tmpOut, 1, 1);
    eigens.setConf(conf);
    if (!inMemory) {
      eigensToVerify = eigens;
      return;
    }
    List<Vector> eigenVectors = Lists.newArrayList();
    for (MatrixSlice slice : eigens) {
      eigenVectors.add(slice.vector());
    }
    if (eigenVectors.isEmpty()) {
      // Nothing to buffer: the column count below is taken from the first vector, which does not
      // exist. Fall back to the distributed matrix instead of failing on get(0).
      log.warn("No eigenvectors found in {}; skipping in-memory buffering", eigenInput);
      eigensToVerify = eigens;
      return;
    }
    eigensToVerify = new SparseRowMatrix(eigenVectors.size(), eigenVectors.get(0).size(),
                                         eigenVectors.toArray(new Vector[eigenVectors.size()]),
                                         true,
                                         true);
  }

  /**
   * @return the Path the cleaned eigenvectors were written to, or null if nothing has been written yet
   */
  public Path getCleanedEigensPath() {
    return cleanedEigensPath;
  }

  public static void main(String[] args) throws Exception {
    ToolRunner.run(new EigenVerificationJob(), args);
  }

  /**
   * Programmatic invocation of run()
   * <p>The minimum eigenvalue is not a parameter here; pruning uses the current value of the
   * corresponding field (0.0 unless set from the command line).</p>
   *
   * @param conf the Configuration to use
   * @param eigenInput Output of LanczosSolver; ignored if null or if eigenvectors were already supplied
   *                   via {@link #setEigensToVerify(VectorIterable)}
   * @param corpusInput Input of LanczosSolver
   * @param output the output Path; temporary data goes to a "tmp" child of it
   * @param inMemory whether to buffer the eigenvectors in memory
   * @param maxError the maximum error tolerated in an eigenvector
   * @param maxEigens the maximum number of eigenvectors to write (0 means all)
   * @throws IOException if writing the cleaned eigenvectors fails
   */
  public void runJob(Configuration conf,
                     Path eigenInput,
                     Path corpusInput,
                     Path output,
                     boolean inMemory,
                     double maxError,
                     int maxEigens) throws IOException {
    // no need to handle command line arguments
    outPath = output;
    tmpOut = new Path(outPath, "tmp");
    maxEigensToKeep = maxEigens;
    this.maxError = maxError;
    if (eigenInput != null && eigensToVerify == null) {
      prepareEigens(new Configuration(conf), eigenInput, inMemory);
    }
    // The matrices get private copies of the configuration; the writer uses the caller's instance.
    verifyPruneAndSave(corpusInput, new Configuration(conf), conf);
  }
}